Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/CMakeLists.txt          |    1
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |   84
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp       | 2577
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp       | 2302
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp               |  557
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h                 | 1194
-rw-r--r--  lib/Transforms/Vectorize/VPlanBuilder.h          |   61
-rw-r--r--  lib/Transforms/Vectorize/VPlanValue.h            |  146
-rw-r--r--  lib/Transforms/Vectorize/Vectorize.cpp           |    1
9 files changed, 5312 insertions, 1611 deletions
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 1aea73cd4a32..7622ed6d194f 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMVectorize
   LoopVectorize.cpp
   SLPVectorizer.cpp
   Vectorize.cpp
+  VPlan.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9cf66382b581..dc83b6d4d292 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,4 +1,4 @@
-//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,47 +6,67 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/OrderedBasicBlock.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "load-store-vectorizer"
+
 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
 
-namespace {
-
 // FIXME: Assuming stack alignment of 4 is always good enough
 static const unsigned StackAdjustedAlignment = 4;
-typedef SmallVector<Instruction *, 8> InstrList;
-typedef MapVector<Value *, InstrList> InstrListMap;
+
+namespace {
+
+using InstrList = SmallVector<Instruction *, 8>;
+using InstrListMap = MapVector<Value *, InstrList>;
 
 class Vectorizer {
   Function &F;
@@ -163,7 +183,10 @@ public:
     AU.setPreservesCFG();
   }
 };
-}
+
+} // end anonymous namespace
+
+char LoadStoreVectorizer::ID = 0;
 
 INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                       "Vectorize load and Store instructions", false, false)
@@ -175,8 +198,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                     "Vectorize load and store instructions", false, false)
 
-char LoadStoreVectorizer::ID = 0;
-
 Pass *llvm::createLoadStoreVectorizerPass() {
   return new LoadStoreVectorizer();
 }
@@ -480,6 +501,10 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
         MemoryInstrs.push_back(&I);
       else
         ChainInstrs.push_back(&I);
+    } else if (isa<IntrinsicInst>(&I) &&
+               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+                   Intrinsic::sideeffect) {
+      // Ignore llvm.sideeffect calls.
     } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
       DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
       break;
@@ -593,7 +618,14 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
       // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
       unsigned TySize = DL.getTypeSizeInBits(Ty);
-      if (TySize < 8)
+      if ((TySize % 8) != 0)
+        continue;
+
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized
+      // load/store and do not support casting between the integer type and a
+      // vector of pointers (e.g. i64 to <2 x i16*>).
+      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
         continue;
 
       Value *Ptr = LI->getPointerOperand();
@@ -605,7 +637,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
         continue;
 
       // Make sure all the users of a vector are constant-index extracts.
-      if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
+      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
@@ -614,7 +646,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
       // Save the load locations.
       Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
       LoadRefs[ObjPtr].push_back(LI);
-
     } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
       if (!SI->isSimple())
         continue;
@@ -627,19 +658,28 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
       if (!VectorType::isValidElementType(Ty->getScalarType()))
         continue;
 
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized
+      // load/store and do not support casting between the integer type and a
+      // vector of pointers (e.g. i64 to <2 x i16*>).
+      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+        continue;
+
       // Skip weird non-byte sizes. They probably aren't worth the effort of
       // handling correctly.
       unsigned TySize = DL.getTypeSizeInBits(Ty);
-      if (TySize < 8)
+      if ((TySize % 8) != 0)
        continue;
 
       Value *Ptr = SI->getPointerOperand();
       unsigned AS = Ptr->getType()->getPointerAddressSpace();
       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+      // No point in looking at these if they're too big to vectorize.
       if (TySize > VecRegSize / 2)
         continue;
 
-      if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
+      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
@@ -680,8 +720,8 @@ bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
   SmallVector<int, 16> Heads, Tails;
   int ConsecutiveChain[64];
 
-  // Do a quadratic search on all of the given stores and find all of the pairs
-  // of stores that follow each other.
+  // Do a quadratic search on all of the given loads/stores and find all of the
+  // pairs of loads/stores that follow each other.
   for (int i = 0, e = Instrs.size(); i < e; ++i) {
     ConsecutiveChain[i] = -1;
     for (int j = e - 1; j >= 0; --j) {
@@ -748,7 +788,7 @@ bool Vectorizer::vectorizeStoreChain(
     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
   StoreInst *S0 = cast<StoreInst>(Chain[0]);
 
-  // If the vector has an int element, default to int for the whole load.
+  // If the vector has an int element, default to int for the whole store.
   Type *StoreTy;
   for (Instruction *I : Chain) {
     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
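A note on the size guard revised above: the old test (TySize < 8) only rejected sub-byte types, whereas (TySize % 8) != 0 rejects every type whose bit width is not a whole number of bytes. A minimal standalone C++ sketch of the difference (the values are hypothetical, not code from the patch):

    // An i20 access is 20 bits wide -- not a whole number of bytes.
    unsigned TySize = 20;
    bool RejectedByOldCheck = (TySize < 8);       // false: i20 slipped through
    bool RejectedByNewCheck = (TySize % 8) != 0;  // true:  i20 is now skipped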
"llvm/IR/DerivedTypes.h"  #include "llvm/IR/DiagnosticInfo.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h"  #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h"  #include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Operator.h"  #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h"  #include "llvm/IR/User.h"  #include "llvm/IR/Value.h"  #include "llvm/IR/ValueHandle.h"  #include "llvm/IR/Verifier.h"  #include "llvm/Pass.h" -#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/LoopSimplify.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Utils/LoopVersioning.h" -#include "llvm/Transforms/Vectorize.h"  #include <algorithm> -#include <map> +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <functional> +#include <iterator> +#include <limits> +#include <memory> +#include <string>  #include <tuple> +#include <utility> +#include <vector>  using namespace llvm; -using namespace llvm::PatternMatch;  #define LV_NAME "loop-vectorize"  #define DEBUG_TYPE LV_NAME @@ -245,12 +280,12 @@ createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,  namespace { -// Forward declarations. -class LoopVectorizeHints;  class LoopVectorizationLegality;  class LoopVectorizationCostModel;  class LoopVectorizationRequirements; +} // end anonymous namespace +  /// Returns true if the given loop body has a cycle, excluding the loop  /// itself.  static bool hasCyclesInLoopBody(const Loop &L) { @@ -324,7 +359,6 @@ static unsigned getMemInstAddressSpace(Value *I) {  /// type is irregular if its allocated size doesn't equal the store size of an  /// element of the corresponding vector type at the given vectorization factor.  static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { -    // Determine if an array of VF elements of type Ty is "bitcast compatible"    // with a <VF x Ty> vector.    if (VF > 1) { @@ -349,7 +383,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; }  static Value *addFastMathFlag(Value *V) {    if (isa<FPMathOperator>(V)) {      FastMathFlags Flags; -    Flags.setUnsafeAlgebra(); +    Flags.setFast();      cast<Instruction>(V)->setFastMathFlags(Flags);    }    return V; @@ -362,6 +396,8 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {                             : ConstantFP::get(Ty, C);  } +namespace llvm { +  /// InnerLoopVectorizer vectorizes loops which contain only one basic  /// block to a specified vectorization factor (VF).  
/// This class performs the widening of scalars into vectors, or multiple @@ -387,16 +423,16 @@ public:                        LoopVectorizationCostModel *CM)        : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),          AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), -        Builder(PSE.getSE()->getContext()), Induction(nullptr), -        OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth), -        TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), -        AddedSafetyChecks(false) {} +        Builder(PSE.getSE()->getContext()), +        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} +  virtual ~InnerLoopVectorizer() = default;    /// Create a new empty loop. Unlink the old loop and connect the new one. -  void createVectorizedLoopSkeleton(); +  /// Return the pre-header block of the new loop. +  BasicBlock *createVectorizedLoopSkeleton(); -  /// Vectorize a single instruction within the innermost loop. -  void vectorizeInstruction(Instruction &I); +  /// Widen a single instruction within the innermost loop. +  void widenInstruction(Instruction &I);    /// Fix the vectorized code, taking care of header phi's, live-outs, and more.    void fixVectorizedLoop(); @@ -404,28 +440,83 @@ public:    // Return true if any runtime check is added.    bool areSafetyChecksAdded() { return AddedSafetyChecks; } -  virtual ~InnerLoopVectorizer() {} - -protected: -  /// A small list of PHINodes. -  typedef SmallVector<PHINode *, 4> PhiVector; -    /// A type for vectorized values in the new loop. Each value from the    /// original loop, when vectorized, is represented by UF vector values in the    /// new unrolled loop, where UF is the unroll factor. -  typedef SmallVector<Value *, 2> VectorParts; +  using VectorParts = SmallVector<Value *, 2>; + +  /// Vectorize a single PHINode in a block. This method handles the induction +  /// variable canonicalization. It supports both VF = 1 for unrolled loops and +  /// arbitrary length vectors. +  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + +  /// A helper function to scalarize a single Instruction in the innermost loop. +  /// Generates a sequence of scalar instances for each lane between \p MinLane +  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, +  /// inclusive.. +  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, +                            bool IfPredicateInstr); + +  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc +  /// is provided, the integer induction variable will first be truncated to +  /// the corresponding type. +  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + +  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a +  /// vector or scalar value on-demand if one is not yet available. When +  /// vectorizing a loop, we visit the definition of an instruction before its +  /// uses. When visiting the definition, we either vectorize or scalarize the +  /// instruction, creating an entry for it in the corresponding map. (In some +  /// cases, such as induction variables, we will create both vector and scalar +  /// entries.) Then, as we encounter uses of the definition, we derive values +  /// for each scalar or vector use unless such a value is already available. 
+  /// For example, if we scalarize a definition and one of its uses is vector, +  /// we build the required vector on-demand with an insertelement sequence +  /// when visiting the use. Otherwise, if the use is scalar, we can use the +  /// existing scalar definition. +  /// +  /// Return a value in the new loop corresponding to \p V from the original +  /// loop at unroll index \p Part. If the value has already been vectorized, +  /// the corresponding vector entry in VectorLoopValueMap is returned. If, +  /// however, the value has a scalar entry in VectorLoopValueMap, we construct +  /// a new vector value on-demand by inserting the scalar values into a vector +  /// with an insertelement sequence. If the value has been neither vectorized +  /// nor scalarized, it must be loop invariant, so we simply broadcast the +  /// value into a vector. +  Value *getOrCreateVectorValue(Value *V, unsigned Part); + +  /// Return a value in the new loop corresponding to \p V from the original +  /// loop at unroll and vector indices \p Instance. If the value has been +  /// vectorized but not scalarized, the necessary extractelement instruction +  /// will be generated. +  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); + +  /// Construct the vector value of a scalarized value \p V one lane at a time. +  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); + +  /// Try to vectorize the interleaved access group that \p Instr belongs to. +  void vectorizeInterleaveGroup(Instruction *Instr); + +  /// Vectorize Load and Store instructions, optionally masking the vector +  /// operations if \p BlockInMask is non-null. +  void vectorizeMemoryInstruction(Instruction *Instr, +                                  VectorParts *BlockInMask = nullptr); + +  /// \brief Set the debug location in the builder using the debug location in +  /// the instruction. +  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + +protected: +  friend class LoopVectorizationPlanner; + +  /// A small list of PHINodes. +  using PhiVector = SmallVector<PHINode *, 4>;    /// A type for scalarized values in the new loop. Each value from the    /// original loop, when scalarized, is represented by UF x VF scalar values    /// in the new unrolled loop, where UF is the unroll factor and VF is the    /// vectorization factor. -  typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts; - -  // When we if-convert we need to create edge masks. We have to cache values -  // so that we don't end up with exponential recursion/IR. -  typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts> -      EdgeMaskCacheTy; -  typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy; +  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;    /// Set up the values of the IVs correctly when exiting the vector loop.    void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, @@ -457,40 +548,14 @@ protected:    /// the block that was created for it.    void sinkScalarOperands(Instruction *PredInst); -  /// Predicate conditional instructions that require predication on their -  /// respective conditions. -  void predicateInstructions(); -    /// Shrinks vector element sizes to the smallest bitwidth they can be legally    /// represented as.    void truncateToMinimalBitwidths(); -  /// A helper function that computes the predicate of the block BB, assuming -  /// that the header block of the loop is set to True. It returns the *entry* -  /// mask for the block BB. 
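The on-demand packing described in the getOrCreateVectorValue comment above reduces to an insertelement sequence, one lane at a time. A minimal sketch, assuming the IRBuilder member `Builder`, the Part/Lane conventions above, and the VPIteration{Part, Lane} aggregate introduced by VPlan.h (illustrative, not the actual body of packScalarIntoVectorValue):

    // Pack the VF scalar instances of V for unroll index Part into one vector.
    Value *VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      VectorValue = Builder.CreateInsertElement(
          VectorValue, getOrCreateScalarValue(V, {Part, Lane}),
          Builder.getInt32(Lane));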
@@ -457,40 +548,14 @@ protected:
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
 
-  /// Predicate conditional instructions that require predication on their
-  /// respective conditions.
-  void predicateInstructions();
-
   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
   /// represented as.
   void truncateToMinimalBitwidths();
 
-  /// A helper function that computes the predicate of the block BB, assuming
-  /// that the header block of the loop is set to True. It returns the *entry*
-  /// mask for the block BB.
-  VectorParts createBlockInMask(BasicBlock *BB);
-
-  /// A helper function that computes the predicate of the edge between SRC
-  /// and DST.
-  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
-  /// Vectorize a single PHINode in a block. This method handles the induction
-  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
-  /// arbitrary length vectors.
-  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
-
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
   void updateAnalysis();
 
-  /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
-  /// scalarized instruction behind an if block predicated on the control
-  /// dependence of the instruction.
-  void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
-
-  /// Vectorize Load and Store instructions,
-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
-
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -521,11 +586,6 @@ protected:
   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                        Value *Step, Instruction *EntryVal);
 
-  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
-  /// is provided, the integer induction variable will first be truncated to
-  /// the corresponding type.
-  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
-
   /// Returns true if an instruction \p I should be scalarized instead of
   /// vectorized for the chosen vectorization factor.
   bool shouldScalarizeInstruction(Instruction *I) const;
@@ -533,37 +593,19 @@ protected:
   /// Returns true if we should generate a scalar version of \p IV.
   bool needsScalarInduction(Instruction *IV) const;
 
-  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
-  /// vector or scalar value on-demand if one is not yet available. When
-  /// vectorizing a loop, we visit the definition of an instruction before its
-  /// uses. When visiting the definition, we either vectorize or scalarize the
-  /// instruction, creating an entry for it in the corresponding map. (In some
-  /// cases, such as induction variables, we will create both vector and scalar
-  /// entries.) Then, as we encounter uses of the definition, we derive values
-  /// for each scalar or vector use unless such a value is already available.
-  /// For example, if we scalarize a definition and one of its uses is vector,
-  /// we build the required vector on-demand with an insertelement sequence
-  /// when visiting the use. Otherwise, if the use is scalar, we can use the
-  /// existing scalar definition.
-  ///
-  /// Return a value in the new loop corresponding to \p V from the original
-  /// loop at unroll index \p Part. If the value has already been vectorized,
-  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
-  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
-  /// a new vector value on-demand by inserting the scalar values into a vector
-  /// with an insertelement sequence. If the value has been neither vectorized
-  /// nor scalarized, it must be loop invariant, so we simply broadcast the
-  /// value into a vector.
-  Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
-  /// Return a value in the new loop corresponding to \p V from the original
-  /// loop at unroll index \p Part and vector index \p Lane. If the value has
-  /// been vectorized but not scalarized, the necessary extractelement
-  /// instruction will be generated.
-  Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
-
-  /// Try to vectorize the interleaved access group that \p Instr belongs to.
-  void vectorizeInterleaveGroup(Instruction *Instr);
+  /// If there is a cast involved in the induction variable \p ID, which should
+  /// be ignored in the vectorized loop body, this function records the
+  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+  /// cast. We had already proven that the casted Phi is equal to the uncasted
+  /// Phi in the vectorized loop (under a runtime guard), and therefore
+  /// there is no need to vectorize the cast - the same value can be used in the
+  /// vector loop for both the Phi and the cast.
+  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+                                             Value *VectorLoopValue,
+                                             unsigned Part,
+                                             unsigned Lane = UINT_MAX);
 
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
@@ -574,12 +616,19 @@ protected:
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(Loop *NewLoop);
 
+  /// Returns a bitcasted value to the requested vector type.
+  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+                                const DataLayout &DL);
+
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+
   /// Emit a bypass check to see if all of the SCEV assumptions we've
   /// had to make are correct.
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
   /// Emit bypass checks to check any memory assumptions we may have made.
   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
 
@@ -601,149 +650,32 @@ protected:
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
-  /// \brief Set the debug location in the builder using the debug location in
-  /// the instruction.
-  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
-  /// This is a helper class for maintaining vectorization state. It's used for
-  /// mapping values from the original loop to their corresponding values in
-  /// the new loop. Two mappings are maintained: one for vectorized values and
-  /// one for scalarized values. Vectorized values are represented with UF
-  /// vector values in the new loop, and scalarized values are represented with
-  /// UF x VF scalar values in the new loop. UF and VF are the unroll and
-  /// vectorization factors, respectively.
-  ///
-  /// Entries can be added to either map with setVectorValue and setScalarValue,
-  /// which assert that an entry was not already added before. If an entry is to
-  /// replace an existing one, call resetVectorValue. This is currently needed
-  /// to modify the mapped values during "fix-up" operations that occur once the
-  /// first phase of widening is complete. These operations include type
-  /// truncation and the second phase of recurrence widening.
-  ///
-  /// Entries from either map can be retrieved using the getVectorValue and
-  /// getScalarValue functions, which assert that the desired value exists.
-
-  struct ValueMap {
-
-    /// Construct an empty map with the given unroll and vectorization factors.
-    ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
-
-    /// \return True if the map has any vector entry for \p Key.
-    bool hasAnyVectorValue(Value *Key) const {
-      return VectorMapStorage.count(Key);
-    }
-
-    /// \return True if the map has a vector entry for \p Key and \p Part.
-    bool hasVectorValue(Value *Key, unsigned Part) const {
-      assert(Part < UF && "Queried Vector Part is too large.");
-      if (!hasAnyVectorValue(Key))
-        return false;
-      const VectorParts &Entry = VectorMapStorage.find(Key)->second;
-      assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
-      return Entry[Part] != nullptr;
-    }
-
-    /// \return True if the map has any scalar entry for \p Key.
-    bool hasAnyScalarValue(Value *Key) const {
-      return ScalarMapStorage.count(Key);
-    }
-
-    /// \return True if the map has a scalar entry for \p Key, \p Part and
-    /// \p Lane.
-    bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
-      assert(Part < UF && "Queried Scalar Part is too large.");
-      assert(Lane < VF && "Queried Scalar Lane is too large.");
-      if (!hasAnyScalarValue(Key))
-        return false;
-      const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
-      assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-      assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
-      return Entry[Part][Lane] != nullptr;
-    }
-
-    /// Retrieve the existing vector value that corresponds to \p Key and
-    /// \p Part.
-    Value *getVectorValue(Value *Key, unsigned Part) {
-      assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
-      return VectorMapStorage[Key][Part];
-    }
-
-    /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
-    /// and \p Lane.
-    Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
-      assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
-      return ScalarMapStorage[Key][Part][Lane];
-    }
-
-    /// Set a vector value associated with \p Key and \p Part. Assumes such a
-    /// value is not already set. If it is, use resetVectorValue() instead.
-    void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
-      assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
-      if (!VectorMapStorage.count(Key)) {
-        VectorParts Entry(UF);
-        VectorMapStorage[Key] = Entry;
-      }
-      VectorMapStorage[Key][Part] = Vector;
-    }
-
-    /// Set a scalar value associated with \p Key for \p Part and \p Lane.
-    /// Assumes such a value is not already set.
-    void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
-                        Value *Scalar) {
-      assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
-      if (!ScalarMapStorage.count(Key)) {
-        ScalarParts Entry(UF);
-        for (unsigned Part = 0; Part < UF; ++Part)
-          Entry[Part].resize(VF, nullptr);
-          // TODO: Consider storing uniform values only per-part, as they occupy
-          //       lane 0 only, keeping the other VF-1 redundant entries null.
-        ScalarMapStorage[Key] = Entry;
-      }
-      ScalarMapStorage[Key][Part][Lane] = Scalar;
-    }
-
-    /// Reset the vector value associated with \p Key for the given \p Part.
-    /// This function can be used to update values that have already been
-    /// vectorized. This is the case for "fix-up" operations including type
-    /// truncation and the second phase of recurrence vectorization.
-    void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
-      assert(hasVectorValue(Key, Part) && "Vector value not set for part");
-      VectorMapStorage[Key][Part] = Vector;
-    }
-
-  private:
-    /// The unroll factor. Each entry in the vector map contains UF vector
-    /// values.
-    unsigned UF;
-
-    /// The vectorization factor. Each entry in the scalar map contains UF x VF
-    /// scalar values.
-    unsigned VF;
-
-    /// The vector and scalar map storage. We use std::map and not DenseMap
-    /// because insertions to DenseMap invalidate its iterators.
-    std::map<Value *, VectorParts> VectorMapStorage;
-    std::map<Value *, ScalarParts> ScalarMapStorage;
-  };
-
   /// The original loop.
   Loop *OrigLoop;
+
   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
   /// dynamic knowledge to simplify SCEV expressions and converts them to a
   /// more usable form.
   PredicatedScalarEvolution &PSE;
+
   /// Loop Info.
   LoopInfo *LI;
+
   /// Dominator Tree.
   DominatorTree *DT;
+
   /// Alias Analysis.
   AliasAnalysis *AA;
+
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
+
   /// Target Transform Info.
   const TargetTransformInfo *TTI;
+
   /// Assumption Cache.
   AssumptionCache *AC;
+
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
@@ -758,7 +690,6 @@ protected:
   /// vector elements.
   unsigned VF;
 
-protected:
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -770,39 +701,45 @@ protected:
   /// The vector-loop preheader.
   BasicBlock *LoopVectorPreHeader;
+
   /// The scalar-loop preheader.
   BasicBlock *LoopScalarPreHeader;
+
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock;
+
   /// The ExitBlock of the scalar loop.
   BasicBlock *LoopExitBlock;
+
   /// The vector loop body.
   BasicBlock *LoopVectorBody;
+
   /// The scalar loop body.
   BasicBlock *LoopScalarBody;
+
   /// A list of all bypass blocks. The first block is the entry of the loop.
   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
 
   /// The new Induction variable which was added to the new block.
-  PHINode *Induction;
+  PHINode *Induction = nullptr;
+
   /// The induction variable of the old basic block.
-  PHINode *OldInduction;
+  PHINode *OldInduction = nullptr;
 
   /// Maps values from the original loop to their corresponding values in the
   /// vectorized loop. A key value can map to either vector values, scalar
   /// values or both kinds of values, depending on whether the key was
   /// vectorized and scalarized.
-  ValueMap VectorLoopValueMap;
+  VectorizerValueMap VectorLoopValueMap;
+
+  /// Store instructions that were predicated.
+  SmallVector<Instruction *, 4> PredicatedInstructions;
 
-  /// Store instructions that should be predicated, as a pair
-  ///   <StoreInst, Predicate>
-  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
-  EdgeMaskCacheTy EdgeMaskCache;
-  BlockMaskCacheTy BlockMaskCache;
 
   /// Trip count of the original loop.
-  Value *TripCount;
+  Value *TripCount = nullptr;
+
   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
-  Value *VectorTripCount;
+  Value *VectorTripCount = nullptr;
 
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
@@ -811,7 +748,7 @@ protected:
   LoopVectorizationCostModel *Cost;
 
   // Record whether runtime checks are added.
-  bool AddedSafetyChecks;
+  bool AddedSafetyChecks = false;
 
   // Holds the end values for each induction variable. We save the end values
   // so we can later fix-up the external users of the induction variables.
@@ -831,7 +768,6 @@ public:
                             UnrollFactor, LVL, CM) {}
 
 private:
-  void vectorizeMemoryInstruction(Instruction *Instr) override;
   Value *getBroadcastInstrs(Value *V) override;
   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                        Instruction::BinaryOps Opcode =
@@ -839,6 +775,8 @@ private:
   Value *reverseVector(Value *Vec) override;
 };
 
+} // end namespace llvm
+
 /// \brief Look for a meaningful debug location on the instruction or its
 /// operands.
 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@@ -861,7 +799,8 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
     const DILocation *DIL = Inst->getDebugLoc();
-    if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+        !isa<DbgInfoIntrinsic>(Inst))
       B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
     else
       B.SetCurrentDebugLocation(DIL);
@@ -908,6 +847,8 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
   }
 }
 
+namespace llvm {
+
 /// \brief The group of interleaved loads/stores sharing the same stride and
 /// close to each other.
 ///
@@ -937,7 +878,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
 class InterleaveGroup {
 public:
   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
-      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+      : Align(Align), InsertPos(Instr) {
     assert(Align && "The alignment should be non-zero");
 
     Factor = std::abs(Stride);
@@ -1010,13 +951,26 @@ public:
   Instruction *getInsertPos() const { return InsertPos; }
   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
 
+  /// Add metadata (e.g. alias info) from the instructions in this group to \p
+  /// NewInst.
+  ///
+  /// FIXME: this function currently does not add noalias metadata a'la
+  /// addNewMetadata.  To do that we need to compute the intersection of the
+  /// noalias info from all members.
+  void addMetadata(Instruction *NewInst) const {
+    SmallVector<Value *, 4> VL;
+    std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
+                   [](std::pair<int, Instruction *> p) { return p.second; });
+    propagateMetadata(NewInst, VL);
+  }
+
 private:
   unsigned Factor; // Interleave Factor.
   bool Reverse;
   unsigned Align;
   DenseMap<int, Instruction *> Members;
-  int SmallestKey;
-  int LargestKey;
+  int SmallestKey = 0;
+  int LargestKey = 0;
 
   // To avoid breaking dependences, vectorized instructions of an interleave
   // group should be inserted at either the first load or the last store in
@@ -1031,6 +985,9 @@ private:
   //      store i32 %odd               // Insert Position
   Instruction *InsertPos;
 };
+} // end namespace llvm
+
+namespace {
 
 /// \brief Drive the analysis of interleaved memory accesses in the loop.
 ///
@@ -1044,8 +1001,7 @@ class InterleavedAccessInfo {
 public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT, LoopInfo *LI)
-      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
-        RequiresScalarEpilogue(false) {}
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
 
   ~InterleavedAccessInfo() {
     SmallSet<InterleaveGroup *, 4> DelSet;
@@ -1065,14 +1021,6 @@ public:
     return InterleaveGroupMap.count(Instr);
   }
 
-  /// \brief Return the maximum interleave factor of all interleaved groups.
-  unsigned getMaxInterleaveFactor() const {
-    unsigned MaxFactor = 1;
-    for (auto &Entry : InterleaveGroupMap)
-      MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
-    return MaxFactor;
-  }
-
   /// \brief Get the interleave group that \p Instr belongs to.
   ///
   /// \returns nullptr if doesn't have such group.
@@ -1095,15 +1043,16 @@ private:
   /// The interleaved access analysis can also add new predicates (for example
   /// by versioning strides of pointers).
   PredicatedScalarEvolution &PSE;
+
   Loop *TheLoop;
   DominatorTree *DT;
   LoopInfo *LI;
-  const LoopAccessInfo *LAI;
+  const LoopAccessInfo *LAI = nullptr;
 
   /// True if the loop may contain non-reversed interleaved groups with
   /// out-of-bounds accesses. We ensure we don't speculatively access memory
   /// out-of-bounds by executing at least one scalar epilogue iteration.
-  bool RequiresScalarEpilogue;
+  bool RequiresScalarEpilogue = false;
 
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
@@ -1114,21 +1063,26 @@ private:
   /// \brief The descriptor for a strided memory access.
   struct StrideDescriptor {
+    StrideDescriptor() = default;
     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                      unsigned Align)
         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
 
-    StrideDescriptor() = default;
-
     // The access's stride. It is negative for a reverse access.
     int64_t Stride = 0;
-    const SCEV *Scev = nullptr; // The scalar expression of this access
-    uint64_t Size = 0;          // The size of the memory object.
-    unsigned Align = 0;         // The alignment of this access.
+
+    // The scalar expression of this access.
+    const SCEV *Scev = nullptr;
+
+    // The size of the memory object.
+    uint64_t Size = 0;
+
+    // The alignment of this access.
+    unsigned Align = 0;
   };
 
   /// \brief A type for holding instructions and their stride descriptors.
-  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
 
   /// \brief Create a new interleave group with the given instruction \p Instr,
   /// stride \p Stride and alignment \p Align.
@@ -1179,7 +1133,6 @@ private:
   /// not necessary or is prevented because \p A and \p B may be dependent.
   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                  StrideEntry *B) const {
-
     // Code motion for interleaved accesses can potentially hoist strided loads
     // and sink strided stores. The code below checks the legality of the
     // following two conditions:
@@ -1244,7 +1197,7 @@ private:
 /// for example 'force', means a decision has been made. So, we need to be
 /// careful NOT to add them if the user hasn't specifically asked so.
 class LoopVectorizeHints {
-  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
+  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
 
   /// Hint - associates name and validation with the hint value.
   struct Hint {
@@ -1263,6 +1216,8 @@ class LoopVectorizeHints {
         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
       case HK_FORCE:
         return (Val <= 1);
+      case HK_ISVECTORIZED:
+        return (Val == 0 || Val == 1);
       }
       return false;
     }
@@ -1270,16 +1225,21 @@ class LoopVectorizeHints {
   /// Vectorization width.
   Hint Width;
+
   /// Vectorization interleave factor.
   Hint Interleave;
+
   /// Vectorization forced
   Hint Force;
 
+  /// Already Vectorized
+  Hint IsVectorized;
+
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
 
   /// True if there is any unsafe math in the loop.
-  bool PotentiallyUnsafe;
+  bool PotentiallyUnsafe = false;
 
 public:
   enum ForceKind {
@@ -1294,7 +1254,7 @@ public:
               HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
-        PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
+        IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
     // Populate values with existing loop metadata.
     getHintsFromMetadata();
 
@@ -1302,14 +1262,19 @@ public:
     if (VectorizerParams::isInterleaveForced())
       Interleave.Value = VectorizerParams::VectorizationInterleave;
 
+    if (IsVectorized.Value != 1)
+      // If the vectorization width and interleaving count are both 1 then
+      // consider the loop to have been already vectorized because there's
+      // nothing more that we can do.
+      IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
           << "LV: Interleaving disabled by the pass manager\n");
   }
 
   /// Mark the loop L as already vectorized by setting the width to 1.
   void setAlreadyVectorized() {
-    Width.Value = Interleave.Value = 1;
-    Hint Hints[] = {Width, Interleave};
+    IsVectorized.Value = 1;
+    Hint Hints[] = {IsVectorized};
     writeHintsToMetadata(Hints);
   }
 
@@ -1326,19 +1291,19 @@ public:
       return false;
     }
 
-    if (getWidth() == 1 && getInterleave() == 1) {
-      // FIXME: Add a separate metadata to indicate when the loop has already
-      // been vectorized instead of setting width and count to 1.
+    if (getIsVectorized() == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
       // FIXME: Add interleave.disable metadata. This will allow
       // vectorize.disable to be used without disabling the pass and errors
       // to differentiate between disabled vectorization and a width of 1.
-      ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+      ORE.emit([&]() {
+        return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                           "AllDisabled", L->getStartLoc(),
                                           L->getHeader())
                << "loop not vectorized: vectorization and interleaving are "
-                  "explicitly disabled, or vectorize width and interleave "
-                  "count are both set to 1");
+                  "explicitly disabled, or the loop has already been "
+                  "vectorized";
+      });
      return false;
     }
 
@@ -1348,29 +1313,35 @@ public:
   /// Dumps all the hint information.
   void emitRemarkWithHints() const {
     using namespace ore;
-    if (Force.Value == LoopVectorizeHints::FK_Disabled)
-      ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+
+    ORE.emit([&]() {
+      if (Force.Value == LoopVectorizeHints::FK_Disabled)
+        return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                         TheLoop->getStartLoc(),
                                         TheLoop->getHeader())
-               << "loop not vectorized: vectorization is explicitly disabled");
-    else {
-      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
-                                 TheLoop->getStartLoc(), TheLoop->getHeader());
-      R << "loop not vectorized";
-      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
-        R << " (Force=" << NV("Force", true);
-        if (Width.Value != 0)
-          R << ", Vector Width=" << NV("VectorWidth", Width.Value);
-        if (Interleave.Value != 0)
-          R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
-        R << ")";
+               << "loop not vectorized: vectorization is explicitly disabled";
+      else {
+        OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+                                   TheLoop->getStartLoc(),
+                                   TheLoop->getHeader());
+        R << "loop not vectorized";
+        if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+          R << " (Force=" << NV("Force", true);
+          if (Width.Value != 0)
+            R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+          if (Interleave.Value != 0)
+            R << ", Interleave Count="
+              << NV("InterleaveCount", Interleave.Value);
+          R << ")";
+        }
+        return R;
       }
-      ORE.emit(R);
-    }
+    });
   }
 
   unsigned getWidth() const { return Width.Value; }
   unsigned getInterleave() const { return Interleave.Value; }
+  unsigned getIsVectorized() const { return IsVectorized.Value; }
   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
 
   /// \brief If hints are provided that force vectorization, use the AlwaysPrint
@@ -1454,7 +1425,7 @@ private:
       return;
     unsigned Val = C->getZExtValue();
 
-    Hint *Hints[] = {&Width, &Interleave, &Force};
+    Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
     for (auto H : Hints) {
       if (Name == H->Name) {
        if (H->validate(Val))
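Since Prefix() is "llvm.loop." and the new hint is named "isvectorized", setAlreadyVectorized() records the hint as "llvm.loop.isvectorized" loop metadata. A simplified sketch of the node it appends to the loop ID (Ctx stands for the LLVMContext; this elides the LoopID rewriting that writeHintsToMetadata performs):

    // Build the !{!"llvm.loop.isvectorized", i32 1} hint operand.
    Metadata *Ops[] = {
        MDString::get(Ctx, "llvm.loop.isvectorized"),
        ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
    MDNode *IsVectorizedMD = MDNode::get(Ctx, Ops);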
-1489,7 +1460,7 @@ private:    /// Sets current hints into loop metadata, keeping other values intact.    void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { -    if (HintTypes.size() == 0) +    if (HintTypes.empty())        return;      // Reserve the first element to LoopID (see below). @@ -1525,6 +1496,8 @@ private:    OptimizationRemarkEmitter &ORE;  }; +} // end anonymous namespace +  static void emitMissedWarning(Function *F, Loop *L,                                const LoopVectorizeHints &LH,                                OptimizationRemarkEmitter *ORE) { @@ -1546,6 +1519,8 @@ static void emitMissedWarning(Function *F, Loop *L,    }  } +namespace { +  /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and  /// to what vectorization factor.  /// This class does not look at the profitability of vectorization, only the @@ -1568,22 +1543,20 @@ public:        std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,        OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,        LoopVectorizeHints *H) -      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), -        GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI), -        PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), -        Requirements(R), Hints(H) {} +      : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), +        ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}    /// ReductionList contains the reduction descriptors for all    /// of the reductions that were found in the loop. -  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList; +  using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;    /// InductionList saves induction variables and maps them to the    /// induction descriptor. -  typedef MapVector<PHINode *, InductionDescriptor> InductionList; +  using InductionList = MapVector<PHINode *, InductionDescriptor>;    /// RecurrenceSet contains the phi nodes that are recurrences other than    /// inductions and reductions. -  typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet; +  using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;    /// Returns true if it is legal to vectorize this loop.    /// This does not mean that it is profitable to vectorize this @@ -1608,7 +1581,17 @@ public:    /// Returns the widest induction type.    Type *getWidestInductionType() { return WidestIndTy; } -  /// Returns True if V is an induction variable in this loop. +  /// Returns True if V is a Phi node of an induction variable in this loop. +  bool isInductionPhi(const Value *V); + +  /// Returns True if V is a cast that is part of an induction def-use chain, +  /// and had been proven to be redundant under a runtime guard (in other +  /// words, the cast has the same SCEV expression as the induction phi). +  bool isCastedInductionVariable(const Value *V); + +  /// Returns True if V can be considered as an induction variable in this  +  /// loop. V can be the induction phi, or some redundant cast in the def-use +  /// chain of the inducion phi.    bool isInductionVariable(const Value *V);    /// Returns True if PN is a reduction variable in this loop. @@ -1629,6 +1612,8 @@ public:    /// 0 - Stride is unknown or non-consecutive.    /// 1 - Address is consecutive.    /// -1 - Address is consecutive, and decreasing. +  /// NOTE: This method must only be used before modifying the original scalar +  /// loop. 
Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).    int isConsecutivePtr(Value *Ptr);    /// Returns true if the value V is uniform within the loop. @@ -1646,11 +1631,6 @@ public:      return InterleaveInfo.isInterleaved(Instr);    } -  /// \brief Return the maximum interleave factor of all interleaved groups. -  unsigned getMaxInterleaveFactor() const { -    return InterleaveInfo.getMaxInterleaveFactor(); -  } -    /// \brief Get the interleaved access group that \p Instr belongs to.    const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {      return InterleaveInfo.getInterleaveGroup(Instr); @@ -1664,6 +1644,10 @@ public:    unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } +  uint64_t getMaxSafeRegisterWidth() const { +	  return LAI->getDepChecker().getMaxSafeRegisterWidth(); +  } +    bool hasStride(Value *V) { return LAI->hasStride(V); }    /// Returns true if the target machine supports masked store operation @@ -1671,21 +1655,25 @@ public:    bool isLegalMaskedStore(Type *DataType, Value *Ptr) {      return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);    } +    /// Returns true if the target machine supports masked load operation    /// for the given \p DataType and kind of access to \p Ptr.    bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {      return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);    } +    /// Returns true if the target machine supports masked scatter operation    /// for the given \p DataType.    bool isLegalMaskedScatter(Type *DataType) {      return TTI->isLegalMaskedScatter(DataType);    } +    /// Returns true if the target machine supports masked gather operation    /// for the given \p DataType.    bool isLegalMaskedGather(Type *DataType) {      return TTI->isLegalMaskedGather(DataType);    } +    /// Returns true if the target machine can represent \p V as a masked gather    /// or scatter operation.    bool isLegalGatherOrScatter(Value *V) { @@ -1701,6 +1689,7 @@ public:    /// Returns true if vector representation of the instruction \p I    /// requires mask.    bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } +    unsigned getNumStores() const { return LAI->getNumStores(); }    unsigned getNumLoads() const { return LAI->getNumLoads(); }    unsigned getNumPredStores() const { return NumPredStores; } @@ -1766,27 +1755,34 @@ private:      return LAI ? &LAI->getSymbolicStrides() : nullptr;    } -  unsigned NumPredStores; +  unsigned NumPredStores = 0;    /// The loop that we evaluate.    Loop *TheLoop; +    /// A wrapper around ScalarEvolution used to add runtime SCEV checks.    /// Applies dynamic knowledge to simplify SCEV expressions in the context    /// of existing SCEV assumptions. The analysis will also add a minimal set    /// of new predicates if this is required to enable vectorization and    /// unrolling.    PredicatedScalarEvolution &PSE; +    /// Target Library Info.    TargetLibraryInfo *TLI; +    /// Target Transform Info    const TargetTransformInfo *TTI; +    /// Dominator Tree.    DominatorTree *DT; +    // LoopAccess analysis.    std::function<const LoopAccessInfo &(Loop &)> *GetLAA; +    // And the loop-accesses info corresponding to this loop.  This pointer is    // null until canVectorizeMemory sets it up. -  const LoopAccessInfo *LAI; +  const LoopAccessInfo *LAI = nullptr; +    /// Interface to emit optimization remarks.    
OptimizationRemarkEmitter *ORE; @@ -1798,27 +1794,38 @@ private:    /// Holds the primary induction variable. This is the counter of the    /// loop. -  PHINode *PrimaryInduction; +  PHINode *PrimaryInduction = nullptr; +    /// Holds the reduction variables.    ReductionList Reductions; +    /// Holds all of the induction variables that we found in the loop.    /// Notice that inductions don't need to start at zero and that induction    /// variables can be pointers.    InductionList Inductions; + +  /// Holds all the casts that participate in the update chain of the induction  +  /// variables, and that have been proven to be redundant (possibly under a  +  /// runtime guard). These casts can be ignored when creating the vectorized  +  /// loop body. +  SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; +    /// Holds the phi nodes that are first-order recurrences.    RecurrenceSet FirstOrderRecurrences; +    /// Holds instructions that need to sink past other instructions to handle    /// first-order recurrences.    DenseMap<Instruction *, Instruction *> SinkAfter; +    /// Holds the widest induction type encountered. -  Type *WidestIndTy; +  Type *WidestIndTy = nullptr;    /// Allowed outside users. This holds the induction and reduction    /// vars which can be accessed from outside the loop.    SmallPtrSet<Value *, 4> AllowedExit;    /// Can we assume the absence of NaNs. -  bool HasFunNoNaNAttr; +  bool HasFunNoNaNAttr = false;    /// Vectorization requirements that will go through late-evaluation.    LoopVectorizationRequirements *Requirements; @@ -1856,9 +1863,13 @@ public:    /// Information about vectorization costs    struct VectorizationFactor { -    unsigned Width; // Vector width with best cost -    unsigned Cost;  // Cost of the loop with that width +    // Vector width with best cost +    unsigned Width; + +    // Cost of the loop with that width +    unsigned Cost;    }; +    /// \return The most profitable vectorization factor and the cost of that VF.    /// This method checks every power of two up to MaxVF. If UserVF is not ZERO    /// then this vectorization factor will be selected if vectorization is @@ -1897,8 +1908,10 @@ public:    struct RegisterUsage {      /// Holds the number of loop invariant values that are used in the loop.      unsigned LoopInvariantRegs; +      /// Holds the maximum number of concurrent live intervals in the loop.      unsigned MaxLocalUsers; +      /// Holds the number of instructions in the loop.      unsigned NumInstructions;    }; @@ -1920,6 +1933,7 @@ public:    /// \returns True if it is more profitable to scalarize instruction \p I for    /// vectorization factor \p VF.    bool isProfitableToScalarize(Instruction *I, unsigned VF) const { +    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");      auto Scalars = InstsToScalarize.find(VF);      assert(Scalars != InstsToScalarize.end() &&             "VF not yet analyzed for scalarization profitability"); @@ -1954,7 +1968,8 @@ public:    /// Decision that was taken during cost calculation for memory instruction.    enum InstWidening {      CM_Unknown, -    CM_Widen, +    CM_Widen,         // For consecutive accesses with stride +1. +    CM_Widen_Reverse, // For consecutive accesses with stride -1.      CM_Interleave,      CM_GatherScatter,      CM_Scalarize @@ -2010,7 +2025,6 @@ public:    /// is an induction variable. Such a truncate will be removed by adding a new    /// induction variable with the destination type.    
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) { -      // If the instruction is not a truncate, return false.      auto *Trunc = dyn_cast<TruncInst>(I);      if (!Trunc) @@ -2030,13 +2044,29 @@ public:        return false;      // If the truncated value is not an induction variable, return false. -    return Legal->isInductionVariable(Op); +    return Legal->isInductionPhi(Op); +  } + +  /// Collects the instructions to scalarize for each predicated instruction in +  /// the loop. +  void collectInstsToScalarize(unsigned VF); + +  /// Collect Uniform and Scalar values for the given \p VF. +  /// The sets depend on CM decision for Load/Store instructions +  /// that may be vectorized as interleave, gather-scatter or scalarized. +  void collectUniformsAndScalars(unsigned VF) { +    // Do the analysis once. +    if (VF == 1 || Uniforms.count(VF)) +      return; +    setCostBasedWideningDecision(VF); +    collectLoopUniforms(VF); +    collectLoopScalars(VF);    }  private:    /// \return An upper bound for the vectorization factor, larger than zero.    /// One is returned if vectorization should best be avoided due to cost. -  unsigned computeFeasibleMaxVF(bool OptForSize); +  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);    /// The vectorization cost is a combination of the cost itself and a boolean    /// indicating whether any of the contributing operations will actually @@ -2045,7 +2075,7 @@ private:    /// is    /// false, then all operations will be scalarized (i.e. no vectorization has    /// actually taken place). -  typedef std::pair<unsigned, bool> VectorizationCostTy; +  using VectorizationCostTy = std::pair<unsigned, bool>;    /// Returns the expected execution cost. The unit of the cost does    /// not matter because we use the 'cost' units to compare different @@ -2102,7 +2132,7 @@ private:    /// A type representing the costs for instructions if they were to be    /// scalarized rather than vectorized. The entries are Instruction-Cost    /// pairs. -  typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; +  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;    /// A set containing all BasicBlocks that are known to present after    /// vectorization as a predicated block. @@ -2134,10 +2164,6 @@ private:    int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,                                unsigned VF); -  /// Collects the instructions to scalarize for each predicated instruction in -  /// the loop. -  void collectInstsToScalarize(unsigned VF); -    /// Collect the instructions that are uniform after vectorization. An    /// instruction is uniform if we represent it with a single scalar value in    /// the vectorized loop corresponding to each vector iteration. Examples of @@ -2156,72 +2182,137 @@ private:    /// iteration of the original scalar loop.    void collectLoopScalars(unsigned VF); -  /// Collect Uniform and Scalar values for the given \p VF. -  /// The sets depend on CM decision for Load/Store instructions -  /// that may be vectorized as interleave, gather-scatter or scalarized. -  void collectUniformsAndScalars(unsigned VF) { -    // Do the analysis once. -    if (VF == 1 || Uniforms.count(VF)) -      return; -    setCostBasedWideningDecision(VF); -    collectLoopUniforms(VF); -    collectLoopScalars(VF); -  } -    /// Keeps cost model vectorization decision and cost for instructions.    /// Right now it is used for memory instructions only. 
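VectorizationCostTy, defined above as a (cost, is-vector) pair, travels through the expected-cost computations. A standalone sketch, assuming (per the comment above) that costs add up while the vector flag ORs across contributing operations; combineCost is a hypothetical name:

    #include <utility>

    using VectorizationCostTy = std::pair<unsigned, bool>;

    // Fold one instruction's cost into a running block cost: costs
    // accumulate, and the block counts as vectorized if anything in it is.
    VectorizationCostTy combineCost(VectorizationCostTy Block,
                                    VectorizationCostTy Inst) {
      return {Block.first + Inst.first, Block.second || Inst.second};
    }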
-  typedef DenseMap<std::pair<Instruction *, unsigned>,
-                   std::pair<InstWidening, unsigned>>
-      DecisionList;
+  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
+                                std::pair<InstWidening, unsigned>>;
 
   DecisionList WideningDecisions;
 
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
+
   /// Predicated scalar evolution analysis.
   PredicatedScalarEvolution &PSE;
+
   /// Loop Info analysis.
   LoopInfo *LI;
+
   /// Vectorization legality.
   LoopVectorizationLegality *Legal;
+
   /// Vector target information.
   const TargetTransformInfo &TTI;
+
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
+
   /// Demanded bits analysis.
   DemandedBits *DB;
+
   /// Assumption cache.
   AssumptionCache *AC;
+
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
   const Function *TheFunction;
+
   /// Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+
   /// Values to ignore in the cost model.
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
 };
 
+} // end anonymous namespace
+
+namespace llvm {
+
 /// LoopVectorizationPlanner - drives the vectorization process after having
 /// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
 class LoopVectorizationPlanner {
+  /// The loop that we evaluate.
+  Loop *OrigLoop;
+
+  /// Loop Info analysis.
+  LoopInfo *LI;
+
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
+
+  /// The legality analysis.
+  LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+  LoopVectorizationCostModel &CM;
+
+  using VPlanPtr = std::unique_ptr<VPlan>;
+
+  SmallVector<VPlanPtr, 4> VPlans;
+
+  /// This class is used to enable the VPlan to invoke a method of ILV. This is
+  /// needed until the method is refactored out of ILV and becomes reusable.
+  struct VPCallbackILV : public VPCallback {
+    InnerLoopVectorizer &ILV;
+
+    VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+    Value *getOrCreateVectorValues(Value *V, unsigned Part) override {
+      return ILV.getOrCreateVectorValue(V, Part);
+    }
+  };
+
+  /// A builder used to construct the current plan.
+  VPBuilder Builder;
+
+  /// When we if-convert we need to create edge masks. We have to cache values
+  /// so that we don't end up with exponential recursion/IR. Note that
+  /// if-conversion currently takes place during VPlan-construction, so these
+  /// caches are only used at that stage.
+  using EdgeMaskCacheTy =
+      DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+  using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+  EdgeMaskCacheTy EdgeMaskCache;
+  BlockMaskCacheTy BlockMaskCache;
+
+  unsigned BestVF = 0;
+  unsigned BestUF = 0;
+
 public:
-  LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+                           const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
                            LoopVectorizationCostModel &CM)
-      : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
-
-  ~LoopVectorizationPlanner() {}
+      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
 
   /// Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
                                                        unsigned UserVF);
 
-  /// Generate the IR code for the vectorized loop.
-  void executePlan(InnerLoopVectorizer &ILV);
+  /// Finalize the best decision and dispose of all other VPlans.
+  void setBestPlan(unsigned VF, unsigned UF);
+
+  /// Generate the IR code for the body of the vectorized loop according to the
+  /// best selected VPlan.
+  void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+  void printPlans(raw_ostream &O) {
+    for (const auto &Plan : VPlans)
+      O << *Plan;
+  }
 
 protected:
   /// Collect the instructions from the original loop that would be trivially
@@ -2229,20 +2320,102 @@ protected:
   void collectTriviallyDeadInstructions(
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 
-private:
-  /// The loop that we evaluate.
-  Loop *OrigLoop;
+  /// A range of powers-of-2 vectorization factors with fixed start and
+  /// adjustable end. The range includes start and excludes end, e.g.:
+  /// [1, 9) = {1, 2, 4, 8}
+  struct VFRange {
+    // A power of 2.
+    const unsigned Start;
 
-  /// Loop Info analysis.
-  LoopInfo *LI;
+    // Need not be a power of 2. If End <= Start, the range is empty.
+    unsigned End;
+  };
 
-  /// The legality analysis.
-  LoopVectorizationLegality *Legal;
+  /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+  /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+  /// returned value holds for the entire \p Range.
+  bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+                                VFRange &Range);
 
-  /// The profitablity analysis.
-  LoopVectorizationCostModel &CM;
+  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+  /// according to the information gathered by Legal when it checked if it is
+  /// legal to vectorize the loop.
+  void buildVPlans(unsigned MinVF, unsigned MaxVF);
+
+private:
+  /// A helper function that computes the predicate of the block BB, assuming
+  /// that the header block of the loop is set to True. It returns the *entry*
+  /// mask for the block BB.
+  VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+  /// Check if \p I belongs to an Interleave Group within the given VF \p Range,
+  /// \return true in the first returned value if so and false otherwise.
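Only the declaration of getDecisionAndClampRange appears in this hunk; its contract suggests an implementation along these lines (a sketch, not the patch's actual body): evaluate the predicate at Range.Start, then clamp Range.End at the first power of two where the answer flips, so a single decision holds for the whole remaining range.

    bool LoopVectorizationPlanner::getDecisionAndClampRange(
        const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
      assert(Range.End > Range.Start && "Trying to test an empty VF range.");
      bool PredicateAtRangeStart = Predicate(Range.Start);
      // Shrink the range so one decision covers every VF left in it.
      for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
        if (Predicate(TmpVF) != PredicateAtRangeStart) {
          Range.End = TmpVF;
          break;
        }
      return PredicateAtRangeStart;
    }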
+  /// Build a new VPInterleaveGroup Recipe if \p I is the primary member of an
+  /// IG for \p Range.Start, and provide it as the second returned value.
+  /// Note that if \p I is an adjunct member of an IG for \p Range.Start, the
+  /// \return value is <true, nullptr>, as it is handled by another recipe.
+  /// \p Range.End may be decreased to ensure same decision from \p Range.Start
+  /// to \p Range.End.
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+  /// Check if \p I is a memory instruction to be widened for \p Range.Start
+  /// and potentially masked. Such instructions are handled by a recipe that
+  /// takes an additional VPInstruction for the mask.
+  VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
+                                                   VFRange &Range,
+                                                   VPlanPtr &Plan);
+
+  /// Check if an induction recipe should be constructed for \p I within the
+  /// given VF \p Range. If so build and return it. If not, return null.
+  /// \p Range.End may be decreased to ensure same decision from
+  /// \p Range.Start to \p Range.End.
+  VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+                                                        VFRange &Range);
+
+  /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+  /// a sequence of select instructions as the vectorizer currently performs
+  /// full if-conversion.
+  VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
+
+  /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+  /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
+  /// extended to include \p I or else build a new VPWidenRecipe for it and
+  /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
+  /// false otherwise. Range.End may be decreased to ensure same decision from
+  /// \p Range.Start to \p Range.End.
+  bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
+
+  /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
+  /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+  /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+  /// Region. Update the packing decision of predicated instructions if they
+  /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+  /// \p Range.Start to \p Range.End.
+  VPBasicBlock *handleReplication(
+      Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+      DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+      VPlanPtr &Plan);
+
+  /// Create a replicating region for instruction \p I that requires
+  /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+  VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+                                       VPlanPtr &Plan);
+
+  /// Build a VPlan according to the information gathered by Legal. \return a
+  /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+  /// exclusive, possibly decreasing \p Range.End.
+  VPlanPtr buildVPlan(VFRange &Range,
+                      const SmallPtrSetImpl<Value *> &NeedDef);
 };
 
+} // end namespace llvm
+
+namespace {
+
 /// \brief This holds vectorization requirements that must be verified late in
 /// the process. The requirements are set by legalize and costmodel.
Once  /// vectorization has been determined to be possible and profitable the @@ -2257,8 +2430,7 @@ private:  /// followed by a non-expert user.  class LoopVectorizationRequirements {  public: -  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) -      : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {} +  LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}    void addUnsafeAlgebraInst(Instruction *I) {      // First unsafe algebra instruction. @@ -2272,12 +2444,14 @@ public:      const char *PassName = Hints.vectorizeAnalysisPassName();      bool Failed = false;      if (UnsafeAlgebraInst && !Hints.allowReordering()) { -      ORE.emit( -          OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps", -                                              UnsafeAlgebraInst->getDebugLoc(), -                                              UnsafeAlgebraInst->getParent()) -          << "loop not vectorized: cannot prove it is safe to reorder " -             "floating-point operations"); +      ORE.emit([&]() { +        return OptimizationRemarkAnalysisFPCommute( +                   PassName, "CantReorderFPOps", +                   UnsafeAlgebraInst->getDebugLoc(), +                   UnsafeAlgebraInst->getParent()) +               << "loop not vectorized: cannot prove it is safe to reorder " +                  "floating-point operations"; +      });        Failed = true;      } @@ -2288,11 +2462,13 @@ public:          NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;      if ((ThresholdReached && !Hints.allowReordering()) ||          PragmaThresholdReached) { -      ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", +      ORE.emit([&]() { +        return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",                                                    L->getStartLoc(),                                                    L->getHeader())                 << "loop not vectorized: cannot prove it is safe to reorder " -                  "memory operations"); +                  "memory operations"; +      });        DEBUG(dbgs() << "LV: Too many memory checks needed.\n");        Failed = true;      } @@ -2301,13 +2477,15 @@ public:    }  private: -  unsigned NumRuntimePointerChecks; -  Instruction *UnsafeAlgebraInst; +  unsigned NumRuntimePointerChecks = 0; +  Instruction *UnsafeAlgebraInst = nullptr;    /// Interface to emit optimization remarks.    OptimizationRemarkEmitter &ORE;  }; +} // end anonymous namespace +  static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {    if (L.empty()) {      if (!hasCyclesInLoopBody(L)) @@ -2318,11 +2496,15 @@ static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {      addAcyclicInnerLoop(*InnerL, V);  } +namespace { +  /// The LoopVectorize Pass.  
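A note on the remark changes in the hunks above: wrapping each OptimizationRemarkAnalysis* construction in a lambda handed to ORE.emit defers building the remark until the emitter decides it will actually be consumed, which keeps the common remarks-disabled path cheap. The general shape (remark name and message here are illustrative only):

    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(PassName, "IllustrativeRemark",
                                        L->getStartLoc(), L->getHeader())
             << "loop not vectorized: illustrative reason";
    });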
struct LoopVectorize : public FunctionPass {    /// Pass identification, replacement for typeid    static char ID; +  LoopVectorizePass Impl; +    explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)        : FunctionPass(ID) {      Impl.DisableUnrolling = NoUnrolling; @@ -2330,8 +2512,6 @@ struct LoopVectorize : public FunctionPass {      initializeLoopVectorizePass(*PassRegistry::getPassRegistry());    } -  LoopVectorizePass Impl; -    bool runOnFunction(Function &F) override {      if (skipFunction(F))        return false; @@ -2450,6 +2630,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(    Instruction *LastInduction = VecInd;    for (unsigned Part = 0; Part < UF; ++Part) {      VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); +    recordVectorLoopValueForInductionCast(II, LastInduction, Part);      if (isa<TruncInst>(EntryVal))        addMetadata(LastInduction, EntryVal);      LastInduction = cast<Instruction>(addFastMathFlag( @@ -2480,11 +2661,26 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {      auto *I = cast<Instruction>(U);      return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));    }; -  return any_of(IV->users(), isScalarInst); +  return llvm::any_of(IV->users(), isScalarInst);  } -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { +void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( +    const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, +    unsigned Lane) { +  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); +  if (Casts.empty()) +    return; +  // Only the first Cast instruction in the Casts vector is of interest. +  // The rest of the Casts (if exist) have no uses outside the +  // induction update chain itself. +  Instruction *CastInst = *Casts.begin(); +  if (Lane < UINT_MAX) +    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); +  else +    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); +} +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {    assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&           "Primary induction variable must have an integer type"); @@ -2564,6 +2760,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {        Value *EntryPart =            getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());        VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); +      recordVectorLoopValueForInductionCast(ID, EntryPart, Part);        if (Trunc)          addMetadata(EntryPart, Trunc);      } @@ -2622,7 +2819,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,    // Floating point operations had to be 'fast' to enable the induction.    FastMathFlags Flags; -  Flags.setUnsafeAlgebra(); +  Flags.setFast();    Value *MulOp = Builder.CreateFMul(Cv, Step);    if (isa<Instruction>(MulOp)) @@ -2638,7 +2835,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,  void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,                                             Value *EntryVal,                                             const InductionDescriptor &ID) { -    // We shouldn't have to build scalar steps if we aren't vectorizing.    
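recordVectorLoopValueForInductionCast, defined above, serves both whole-vector and per-lane recording through a sentinel Lane value. The corresponding declaration is not shown in this hunk; presumably it defaults the parameter along these lines:

    // Sketch of the assumed declaration: UINT_MAX (the default) means
    // "no particular lane", so the value is recorded as a vector part;
    // any smaller Lane records a single scalar instance instead.
    void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                               Value *VectorLoopVal,
                                               unsigned Part,
                                               unsigned Lane = UINT_MAX);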
assert(VF > 1 && "VF should be greater than one"); @@ -2663,21 +2859,21 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,    // iteration. If EntryVal is uniform, we only need to generate the first    // lane. Otherwise, we generate all VF values.    unsigned Lanes = -    Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; - +      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 +                                                                         : VF;    // Compute the scalar steps and save the results in VectorLoopValueMap.    for (unsigned Part = 0; Part < UF; ++Part) {      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {        auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);        auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));        auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); -      VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); +      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); +      recordVectorLoopValueForInductionCast(ID, Add, Part, Lane);      }    }  }  int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { -    const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :      ValueToValueMap(); @@ -2708,8 +2904,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {    // instead. If it has been scalarized, and we actually need the value in    // vector form, we will construct the vector values on demand.    if (VectorLoopValueMap.hasAnyScalarValue(V)) { - -    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); +    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});      // If we've scalarized a value, that value should be an instruction.      auto *I = cast<Instruction>(V); @@ -2726,8 +2921,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {      // of the Part unroll iteration. Otherwise, the last instruction is the one      // we created for the last vector lane of the Part unroll iteration.      unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; -    auto *LastInst = -        cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); +    auto *LastInst = cast<Instruction>( +        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));      // Set the insert point after the last scalarized instruction. This ensures      // the insertelement sequence will directly follow the scalar definitions. @@ -2744,14 +2939,15 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {      Value *VectorValue = nullptr;      if (Cost->isUniformAfterVectorization(I, VF)) {        VectorValue = getBroadcastInstrs(ScalarValue); +      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);      } else { -      VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); +      // Initialize packing with insertelements to start from undef. 
+      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); +      VectorLoopValueMap.setVectorValue(V, Part, Undef);        for (unsigned Lane = 0; Lane < VF; ++Lane) -        VectorValue = Builder.CreateInsertElement( -            VectorValue, getOrCreateScalarValue(V, Part, Lane), -            Builder.getInt32(Lane)); +        packScalarIntoVectorValue(V, {Part, Lane}); +      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);      } -    VectorLoopValueMap.setVectorValue(V, Part, VectorValue);      Builder.restoreIP(OldIP);      return VectorValue;    } @@ -2763,28 +2959,29 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {    return B;  } -Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, -                                                   unsigned Lane) { - +Value * +InnerLoopVectorizer::getOrCreateScalarValue(Value *V, +                                            const VPIteration &Instance) {    // If the value is not an instruction contained in the loop, it should    // already be scalar.    if (OrigLoop->isLoopInvariant(V))      return V; -  assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) -                  : true && "Uniform values only have lane zero"); +  assert(Instance.Lane > 0 +             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) +             : true && "Uniform values only have lane zero");    // If the value from the original loop has not been vectorized, it is    // represented by UF x VF scalar values in the new loop. Return the requested    // scalar value. -  if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) -    return VectorLoopValueMap.getScalarValue(V, Part, Lane); +  if (VectorLoopValueMap.hasScalarValue(V, Instance)) +    return VectorLoopValueMap.getScalarValue(V, Instance);    // If the value has not been scalarized, get its entry in VectorLoopValueMap    // for the given unroll part. If this entry is not a vector type (i.e., the    // vectorization factor is one), there is no need to generate an    // extractelement instruction. -  auto *U = getOrCreateVectorValue(V, Part); +  auto *U = getOrCreateVectorValue(V, Instance.Part);    if (!U->getType()->isVectorTy()) {      assert(VF == 1 && "Value not scalarized has non-vector type");      return U; @@ -2793,7 +2990,20 @@ Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,    // Otherwise, the value from the original loop has been vectorized and is    // represented by UF vector values. Extract and return the requested scalar    // value from the appropriate vector lane. 
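Several hunks here replace explicit (Part, Lane) parameter pairs with a single VPIteration instance. VPIteration is defined in the new VPlan.h rather than in this file; from its uses, such as {Part, Lane} above, it is essentially a pair of coordinates naming one scalar copy of an instruction:

    // Sketch inferred from usage; the authoritative definition is in VPlan.h.
    struct VPIteration {
      unsigned Part; // which unrolled copy, in [0, UF)
      unsigned Lane; // which vector lane, in [0, VF)
    };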
-  return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); +  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); +} + +void InnerLoopVectorizer::packScalarIntoVectorValue( +    Value *V, const VPIteration &Instance) { +  assert(V != Induction && "The new induction variable should not be used."); +  assert(!V->getType()->isVectorTy() && "Can't pack a vector"); +  assert(!V->getType()->isVoidTy() && "Type does not produce a value"); + +  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); +  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); +  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, +                                            Builder.getInt32(Instance.Lane)); +  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);  }  Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2843,6 +3053,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {    if (Instr != Group->getInsertPos())      return; +  const DataLayout &DL = Instr->getModule()->getDataLayout();    Value *Ptr = getPointerOperand(Instr);    // Prepare for the vector type of the interleaved load/store. @@ -2866,7 +3077,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {      Index += (VF - 1) * Group->getFactor();    for (unsigned Part = 0; Part < UF; Part++) { -    Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); +    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});      // Notice current instruction could be any index. Need to adjust the address      // to the member of index 0. @@ -2890,13 +3101,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {    // Vectorize the interleaved load group.    if (isa<LoadInst>(Instr)) { -      // For each unroll part, create a wide load for the group.      SmallVector<Value *, 2> NewLoads;      for (unsigned Part = 0; Part < UF; Part++) {        auto *NewLoad = Builder.CreateAlignedLoad(            NewPtrs[Part], Group->getAlignment(), "wide.vec"); -      addMetadata(NewLoad, Instr); +      Group->addMetadata(NewLoad);        NewLoads.push_back(NewLoad);      } @@ -2917,7 +3127,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {          // If this member has different type, cast the result type.          if (Member->getType() != ScalarTy) {            VectorType *OtherVTy = VectorType::get(Member->getType(), VF); -          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); +          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);          }          if (Group->isReverse()) @@ -2946,9 +3156,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {        if (Group->isReverse())          StoredVec = reverseVector(StoredVec); -      // If this member has different type, cast it to an unified type. +      // If this member has different type, cast it to a unified type. 
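For context on vectorizeInterleaveGroup, whose load path appears above and whose store path continues below: a factor-2 interleave group is loaded with one wide load and then split per member by strided shuffles. A hedged illustration for VF = 4 with 32-bit elements (all names invented):

    // Loading {A[2i], A[2i+1]} for i = 0..3 becomes one wide load plus
    // two de-interleaving shufflevectors:
    //   %wide.vec     = load <8 x i32>, <8 x i32>* %ptr
    //   %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
    //                     <4 x i32> <i32 0, i32 2, i32 4, i32 6>  ; A[2i]
    //   %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
    //                     <4 x i32> <i32 1, i32 3, i32 5, i32 7>  ; A[2i+1]

The store path below builds the mirror image: the per-member vectors are concatenated by shuffles into one wide value for a single wide store.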
+        if (StoredVec->getType() != SubVT) -        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); +        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);        StoredVecs.push_back(StoredVec);      } @@ -2963,11 +3174,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {      Instruction *NewStoreInstr =          Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); -    addMetadata(NewStoreInstr, Instr); + +    Group->addMetadata(NewStoreInstr);    }  } -void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, +                                                     VectorParts *BlockInMask) {    // Attempt to issue a wide load.    LoadInst *LI = dyn_cast<LoadInst>(Instr);    StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2992,14 +3205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {      Alignment = DL.getABITypeAlignment(ScalarDataTy);    unsigned AddressSpace = getMemInstAddressSpace(Instr); -  // Scalarize the memory instruction if necessary. -  if (Decision == LoopVectorizationCostModel::CM_Scalarize) -    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); -    // Determine if the pointer operand of the access is either consecutive or    // reverse consecutive. -  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); -  bool Reverse = ConsecutiveStride < 0; +  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); +  bool ConsecutiveStride = +      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);    bool CreateGatherScatter =        (Decision == LoopVectorizationCostModel::CM_GatherScatter); @@ -3010,9 +3220,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    // Handle consecutive loads/stores.    if (ConsecutiveStride) -    Ptr = getOrCreateScalarValue(Ptr, 0, 0); +    Ptr = getOrCreateScalarValue(Ptr, {0, 0}); + +  VectorParts Mask; +  bool isMaskRequired = BlockInMask; +  if (isMaskRequired) +    Mask = *BlockInMask; -  VectorParts Mask = createBlockInMask(Instr->getParent());    // Handle Stores:    if (SI) {      assert(!Legal->isUniform(SI->getPointerOperand()) && @@ -3023,7 +3237,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {        Instruction *NewSI = nullptr;        Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);        if (CreateGatherScatter) { -        Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; +        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;          Value *VectorGep = getOrCreateVectorValue(Ptr, Part);          NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,                                              MaskPart); @@ -3045,13 +3259,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {                Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));            PartPtr =                Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); -          Mask[Part] = reverseVector(Mask[Part]); +          if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
+            Mask[Part] = reverseVector(Mask[Part]);          }          Value *VecPtr =              Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); -        if (Legal->isMaskRequired(SI)) +        if (isMaskRequired)            NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,                                              Mask[Part]);          else @@ -3068,7 +3283,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    for (unsigned Part = 0; Part < UF; ++Part) {      Value *NewLI;      if (CreateGatherScatter) { -      Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; +      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);        NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,                                           nullptr, "wide.masked.gather"); @@ -3083,12 +3298,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {          // wide load needs to start at the last vector element.          PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));          PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); -        Mask[Part] = reverseVector(Mask[Part]); +        if (isMaskRequired) // Reverse of a null all-one mask is a null mask. +          Mask[Part] = reverseVector(Mask[Part]);        }        Value *VecPtr =            Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); -      if (Legal->isMaskRequired(LI)) +      if (isMaskRequired)          NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],                                           UndefValue::get(DataTy),                                           "wide.masked.load"); @@ -3105,71 +3321,41 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {  }  void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, +                                               const VPIteration &Instance,                                                 bool IfPredicateInstr) {    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); -  DEBUG(dbgs() << "LV: Scalarizing" -               << (IfPredicateInstr ? " and predicating:" : ":") << *Instr -               << '\n'); -  // Holds vector parameters or scalars, in case of uniform vals. -  SmallVector<VectorParts, 4> Params;    setDebugLocFromInst(Builder, Instr);    // Does this instruction return a value ?    bool IsVoidRetTy = Instr->getType()->isVoidTy(); -  VectorParts Cond; -  if (IfPredicateInstr) -    Cond = createBlockInMask(Instr->getParent()); - -  // Determine the number of scalars we need to generate for each unroll -  // iteration. If the instruction is uniform, we only need to generate the -  // first lane. Otherwise, we generate all VF values. -  unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; - -  // For each vector unroll 'part': -  for (unsigned Part = 0; Part < UF; ++Part) { -    // For each scalar that we create: -    for (unsigned Lane = 0; Lane < Lanes; ++Lane) { +  Instruction *Cloned = Instr->clone(); +  if (!IsVoidRetTy) +    Cloned->setName(Instr->getName() + ".cloned"); -      // Start if-block. 
-      Value *Cmp = nullptr;
-      if (IfPredicateInstr) {
-        Cmp = Cond[Part];
-        if (Cmp->getType()->isVectorTy())
-          Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
-        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
-                                 ConstantInt::get(Cmp->getType(), 1));
-      }
-
-      Instruction *Cloned = Instr->clone();
-      if (!IsVoidRetTy)
-        Cloned->setName(Instr->getName() + ".cloned");
-
-      // Replace the operands of the cloned instructions with their scalar
-      // equivalents in the new loop.
-      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-        auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
-        Cloned->setOperand(op, NewOp);
-      }
-      addNewMetadata(Cloned, Instr);
+  // Replace the operands of the cloned instructions with their scalar
+  // equivalents in the new loop.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+    Cloned->setOperand(op, NewOp);
+  }
+  addNewMetadata(Cloned, Instr);
 
-      // Place the cloned scalar in the new loop.
-      Builder.Insert(Cloned);
+  // Place the cloned scalar in the new loop.
+  Builder.Insert(Cloned);
 
-      // Add the cloned scalar to the scalar map entry.
-      VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
+  // Add the cloned scalar to the scalar map entry.
+  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
 
-      // If we just cloned a new assumption, add it the assumption cache.
-      if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
-        if (II->getIntrinsicID() == Intrinsic::assume)
-          AC->registerAssumption(II);
+  // If we just cloned a new assumption, add it to the assumption cache.
+  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+    if (II->getIntrinsicID() == Intrinsic::assume)
+      AC->registerAssumption(II);
 
-      // End if-block.
-      if (IfPredicateInstr)
-        PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
-    }
-  }
+  // End if-block.
+  if (IfPredicateInstr)
+    PredicatedInstructions.push_back(Cloned);
 }
 
 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -3281,6 +3467,36 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   return VectorTripCount;
 }
 
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+                                                   const DataLayout &DL) {
+  // Verify that V is a vector type with same number of elements as DstVTy.
+  unsigned VF = DstVTy->getNumElements();
+  VectorType *SrcVecTy = cast<VectorType>(V->getType());
+  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+  Type *SrcElemTy = SrcVecTy->getElementType();
+  Type *DstElemTy = DstVTy->getElementType();
+  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+         "Vector elements must have same size");
+
+  // Do a direct cast if element types are castable.
+  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+    return Builder.CreateBitOrPointerCast(V, DstVTy);
+  }
+  // V cannot be directly casted to desired vector type.
+  // May happen when V is a floating point vector but DstVTy is a vector of
+  // pointers or vice-versa. Handle this using a two-step bitcast using an
+  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && +         "Only one type should be a pointer type"); +  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && +         "Only one type should be a floating point type"); +  Type *IntTy = +      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); +  VectorType *VecIntTy = VectorType::get(IntTy, VF); +  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); +  return Builder.CreateBitOrPointerCast(CastVal, DstVTy); +} +  void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,                                                           BasicBlock *Bypass) {    Value *Count = getOrCreateTripCount(L); @@ -3373,7 +3589,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {    LVer->prepareNoAliasMetadata();  } -void InnerLoopVectorizer::createVectorizedLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {    /*     In this function we generate a new loop. The new loop will contain     the vectorized instructions while the old loop will continue to run the @@ -3435,7 +3651,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {        MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");    // Create and register the new vector loop. -  Loop *Lp = new Loop(); +  Loop *Lp = LI->AllocateLoop();    Loop *ParentLoop = OrigLoop->getParentLoop();    // Insert the new loop into the loop nest and register the new basic blocks @@ -3554,6 +3770,8 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {    LoopVectorizeHints Hints(Lp, true, *ORE);    Hints.setAlreadyVectorized(); + +  return LoopVectorPreHeader;  }  // Fix up external users of the induction variable. At this point, we are @@ -3622,22 +3840,27 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,  }  namespace { +  struct CSEDenseMapInfo {    static bool canHandle(const Instruction *I) {      return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||             isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);    } +    static inline Instruction *getEmptyKey() {      return DenseMapInfo<Instruction *>::getEmptyKey();    } +    static inline Instruction *getTombstoneKey() {      return DenseMapInfo<Instruction *>::getTombstoneKey();    } +    static unsigned getHashValue(const Instruction *I) {      assert(canHandle(I) && "Unknown instruction!");      return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),                                                             I->value_op_end()));    } +    static bool isEqual(const Instruction *LHS, const Instruction *RHS) {      if (LHS == getEmptyKey() || RHS == getEmptyKey() ||          LHS == getTombstoneKey() || RHS == getTombstoneKey()) @@ -3645,7 +3868,8 @@ struct CSEDenseMapInfo {      return LHS->isIdenticalTo(RHS);    }  }; -} + +} // end anonymous namespace  ///\brief Perform cse of induction variable instructions.  static void cse(BasicBlock *BB) { @@ -3777,7 +4001,6 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {    // For every instruction `I` in MinBWs, truncate the operands, create a    // truncated version of `I` and reextend its result. InstCombine runs    // later and will remove any ext/trunc pairs. 
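A note on the createBitOrPointerCast helper completed earlier in this hunk: when the element types are not directly castable, the value detours through an integer vector of matching element width. A hypothetical use (ILV, FloatVec, Ctx and DL are assumed to be in scope, with 32-bit pointers so float and i8* elements have equal size):

    // Cast <4 x float> to <4 x i8*>; the helper emits a bitcast to
    // <4 x i32> followed by an inttoptr, since float <-> pointer is not
    // bit-or-noop castable in one step:
    //   %0 = bitcast <4 x float> %v to <4 x i32>
    //   %1 = inttoptr <4 x i32> %0 to <4 x i8*>
    VectorType *DstVTy =
        VectorType::get(Type::getInt8PtrTy(Ctx), /*NumElts=*/4);
    Value *PtrVec = ILV.createBitOrPointerCast(FloatVec, DstVTy, DL);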
-  //    SmallPtrSet<Value *, 4> Erased;    for (const auto &KV : Cost->getMinimalBitwidths()) {      // If the value wasn't vectorized, we must maintain the original scalar @@ -3927,7 +4150,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {                   IVEndValues[Entry.first], LoopMiddleBlock);    fixLCSSAPHIs(); -  predicateInstructions(); +  for (Instruction *PI : PredicatedInstructions) +    sinkScalarOperands(&*PI);    // Remove redundant induction instructions.    cse(LoopVectorBody); @@ -3953,7 +4177,6 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() {  }  void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { -    // This is the second phase of vectorizing first-order recurrences. An    // overview of the transformation is described below. Suppose we have the    // following loop. @@ -4211,7 +4434,8 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {    // entire expression in the smaller type.    if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); -    Builder.SetInsertPoint(LoopVectorBody->getTerminator()); +    Builder.SetInsertPoint( +        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());      VectorParts RdxParts(UF);      for (unsigned Part = 0; Part < UF; ++Part) {        RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); @@ -4317,7 +4541,6 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {  }  void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { -    // The basic block and loop containing the predicated instruction.    auto *PredBB = PredInst->getParent();    auto *VectorLoop = LI->getLoopFor(PredBB); @@ -4346,7 +4569,6 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {    // through the worklist doesn't sink a single instruction.    bool Changed;    do { -      // Add the instructions that need to be reanalyzed to the worklist, and      // reset the changed indicator.      Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); @@ -4365,7 +4587,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {        // It's legal to sink the instruction if all its uses occur in the        // predicated block. Otherwise, there's nothing to do yet, and we may        // need to reanalyze the instruction. -      if (!all_of(I->uses(), isBlockOfUsePredicated)) { +      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {          InstsToReanalyze.push_back(I);          continue;        } @@ -4382,200 +4604,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {    } while (Changed);  } -void InnerLoopVectorizer::predicateInstructions() { - -  // For each instruction I marked for predication on value C, split I into its -  // own basic block to form an if-then construct over C. Since I may be fed by -  // an extractelement instruction or other scalar operand, we try to -  // iteratively sink its scalar operands into the predicated block. If I feeds -  // an insertelement instruction, we try to move this instruction into the -  // predicated block as well. For non-void types, a phi node will be created -  // for the resulting value (either vector or scalar). -  // -  // So for some predicated instruction, e.g. the conditional sdiv in: -  // -  // for.body: -  //  ... 
-  //  %add = add nsw i32 %mul, %0 -  //  %cmp5 = icmp sgt i32 %2, 7 -  //  br i1 %cmp5, label %if.then, label %if.end -  // -  // if.then: -  //  %div = sdiv i32 %0, %1 -  //  br label %if.end -  // -  // if.end: -  //  %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] -  // -  // the sdiv at this point is scalarized and if-converted using a select. -  // The inactive elements in the vector are not used, but the predicated -  // instruction is still executed for all vector elements, essentially: -  // -  // vector.body: -  //  ... -  //  %17 = add nsw <2 x i32> %16, %wide.load -  //  %29 = extractelement <2 x i32> %wide.load, i32 0 -  //  %30 = extractelement <2 x i32> %wide.load51, i32 0 -  //  %31 = sdiv i32 %29, %30 -  //  %32 = insertelement <2 x i32> undef, i32 %31, i32 0 -  //  %35 = extractelement <2 x i32> %wide.load, i32 1 -  //  %36 = extractelement <2 x i32> %wide.load51, i32 1 -  //  %37 = sdiv i32 %35, %36 -  //  %38 = insertelement <2 x i32> %32, i32 %37, i32 1 -  //  %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 -  // -  // Predication will now re-introduce the original control flow to avoid false -  // side-effects by the sdiv instructions on the inactive elements, yielding -  // (after cleanup): -  // -  // vector.body: -  //  ... -  //  %5 = add nsw <2 x i32> %4, %wide.load -  //  %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7> -  //  %9 = extractelement <2 x i1> %8, i32 0 -  //  br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue -  // -  // pred.sdiv.if: -  //  %10 = extractelement <2 x i32> %wide.load, i32 0 -  //  %11 = extractelement <2 x i32> %wide.load51, i32 0 -  //  %12 = sdiv i32 %10, %11 -  //  %13 = insertelement <2 x i32> undef, i32 %12, i32 0 -  //  br label %pred.sdiv.continue -  // -  // pred.sdiv.continue: -  //  %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] -  //  %15 = extractelement <2 x i1> %8, i32 1 -  //  br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 -  // -  // pred.sdiv.if54: -  //  %16 = extractelement <2 x i32> %wide.load, i32 1 -  //  %17 = extractelement <2 x i32> %wide.load51, i32 1 -  //  %18 = sdiv i32 %16, %17 -  //  %19 = insertelement <2 x i32> %14, i32 %18, i32 1 -  //  br label %pred.sdiv.continue55 -  // -  // pred.sdiv.continue55: -  //  %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] -  //  %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 - -  for (auto KV : PredicatedInstructions) { -    BasicBlock::iterator I(KV.first); -    BasicBlock *Head = I->getParent(); -    auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, -                                        /*BranchWeights=*/nullptr, DT, LI); -    I->moveBefore(T); -    sinkScalarOperands(&*I); - -    BasicBlock *PredicatedBlock = I->getParent(); -    Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); -    PredicatedBlock->setName(BBNamePrefix + ".if"); -    PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); - -    // If the instruction is non-void create a Phi node at reconvergence point. -    if (!I->getType()->isVoidTy()) { -      Value *IncomingTrue = nullptr; -      Value *IncomingFalse = nullptr; - -      if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) { -        // If the predicated instruction is feeding an insert-element, move it -        // into the Then block; Phi node will be created for the vector. 
-        InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin()); -        IEI->moveBefore(T); -        IncomingTrue = IEI; // the new vector with the inserted element. -        IncomingFalse = IEI->getOperand(0); // the unmodified vector -      } else { -        // Phi node will be created for the scalar predicated instruction. -        IncomingTrue = &*I; -        IncomingFalse = UndefValue::get(I->getType()); -      } - -      BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); -      assert(PostDom && "Then block has multiple successors"); -      PHINode *Phi = -          PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); -      IncomingTrue->replaceAllUsesWith(Phi); -      Phi->addIncoming(IncomingFalse, Head); -      Phi->addIncoming(IncomingTrue, I->getParent()); -    } -  } - -  DEBUG(DT->verifyDomTree()); -} - -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { -  assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); - -  // Look for cached value. -  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); -  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); -  if (ECEntryIt != EdgeMaskCache.end()) -    return ECEntryIt->second; - -  VectorParts SrcMask = createBlockInMask(Src); - -  // The terminator has to be a branch inst! -  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); -  assert(BI && "Unexpected terminator found"); - -  if (BI->isConditional()) { - -    VectorParts EdgeMask(UF); -    for (unsigned Part = 0; Part < UF; ++Part) { -      auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part); -      if (BI->getSuccessor(0) != Dst) -        EdgeMaskPart = Builder.CreateNot(EdgeMaskPart); - -      EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]); -      EdgeMask[Part] = EdgeMaskPart; -    } - -    EdgeMaskCache[Edge] = EdgeMask; -    return EdgeMask; -  } - -  EdgeMaskCache[Edge] = SrcMask; -  return SrcMask; -} - -InnerLoopVectorizer::VectorParts -InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { -  assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); - -  // Look for cached value. -  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); -  if (BCEntryIt != BlockMaskCache.end()) -    return BCEntryIt->second; - -  VectorParts BlockMask(UF); - -  // Loop incoming mask is all-one. -  if (OrigLoop->getHeader() == BB) { -    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); -    for (unsigned Part = 0; Part < UF; ++Part) -      BlockMask[Part] = getOrCreateVectorValue(C, Part); -    BlockMaskCache[BB] = BlockMask; -    return BlockMask; -  } - -  // This is the block mask. We OR all incoming edges, and with zero. 
-  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); -  for (unsigned Part = 0; Part < UF; ++Part) -    BlockMask[Part] = getOrCreateVectorValue(Zero, Part); - -  // For each pred: -  for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) { -    VectorParts EM = createEdgeMask(*It, BB); -    for (unsigned Part = 0; Part < UF; ++Part) -      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]); -  } - -  BlockMaskCache[BB] = BlockMask; -  return BlockMask; -} -  void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,                                                unsigned VF) { +  assert(PN->getParent() == OrigLoop->getHeader() && +         "Non-header phis should have been handled elsewhere"); +    PHINode *P = cast<PHINode>(PN);    // In order to support recurrences we need to be able to vectorize Phi nodes.    // Phi nodes have cycles, so we need to vectorize them in two stages. This is @@ -4594,43 +4627,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,    }    setDebugLocFromInst(Builder, P); -  // Check for PHI nodes that are lowered to vector selects. -  if (P->getParent() != OrigLoop->getHeader()) { -    // We know that all PHIs in non-header blocks are converted into -    // selects, so we don't have to worry about the insertion order and we -    // can just use the builder. -    // At this point we generate the predication tree. There may be -    // duplications since this is a simple recursive scan, but future -    // optimizations will clean it up. - -    unsigned NumIncoming = P->getNumIncomingValues(); - -    // Generate a sequence of selects of the form: -    // SELECT(Mask3, In3, -    //      SELECT(Mask2, In2, -    //                   ( ...))) -    VectorParts Entry(UF); -    for (unsigned In = 0; In < NumIncoming; In++) { -      VectorParts Cond = -          createEdgeMask(P->getIncomingBlock(In), P->getParent()); - -      for (unsigned Part = 0; Part < UF; ++Part) { -        Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part); -        // We might have single edge PHIs (blocks) - use an identity -        // 'select' for the first PHI operand. -        if (In == 0) -          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0); -        else -          // Select between the current value and the previous incoming edge -          // based on the incoming mask. -          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part], -                                             "predphi"); -      } -    } -    for (unsigned Part = 0; Part < UF; ++Part) -      VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]); -    return; -  }    // This PHINode must be an induction variable.    // Make sure that we know about it. @@ -4646,7 +4642,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,      llvm_unreachable("Unknown induction");    case InductionDescriptor::IK_IntInduction:    case InductionDescriptor::IK_FpInduction: -    return widenIntOrFpInduction(P); +    llvm_unreachable("Integer/fp induction is handled elsewhere.");    case InductionDescriptor::IK_PtrInduction: {      // Handle the pointer induction variable case.      
assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4665,7 +4661,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);          Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);          SclrGep->setName("next.gep"); -        VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); +        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);        }      }      return; @@ -4691,25 +4687,11 @@ static bool mayDivideByZero(Instruction &I) {    return !CInt || CInt->isZero();  } -void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { -  // Scalarize instructions that should remain scalar after vectorization. -  if (VF > 1 && -      !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && -      shouldScalarizeInstruction(&I)) { -    scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); -    return; -  } - +void InnerLoopVectorizer::widenInstruction(Instruction &I) {    switch (I.getOpcode()) {    case Instruction::Br: -    // Nothing to do for PHIs and BR, since we already took care of the -    // loop control flow instructions. -    break; -  case Instruction::PHI: { -    // Vectorize PHINodes. -    widenPHIInstruction(&I, UF, VF); -    break; -  } // End of PHI. +  case Instruction::PHI: +    llvm_unreachable("This instruction is handled by a different recipe.");    case Instruction::GetElementPtr: {      // Construct a vector GEP by widening the operands of the scalar GEP as      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4746,7 +4728,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {        // values in the vector mapping with initVector, as we do for other        // instructions.        for (unsigned Part = 0; Part < UF; ++Part) { -          // The pointer operand of the new GEP. If it's loop-invariant, we          // won't broadcast it.          auto *Ptr = @@ -4782,13 +4763,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {    case Instruction::SDiv:    case Instruction::SRem:    case Instruction::URem: -    // Scalarize with predication if this instruction may divide by zero and -    // block execution is conditional, otherwise fallthrough. -    if (Legal->isScalarWithPredication(&I)) { -      scalarizeInstruction(&I, true); -      break; -    } -    LLVM_FALLTHROUGH;    case Instruction::Add:    case Instruction::FAdd:    case Instruction::Sub: @@ -4836,7 +4810,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {      // We have to take the 'vectorized' value and pick the first lane.      // Instcombine will make this a no-op. -    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); +    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});      for (unsigned Part = 0; Part < UF; ++Part) {        Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4862,8 +4836,10 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {        Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);        Value *C = nullptr;        if (FCmp) { +        // Propagate fast math flags. 
+        IRBuilder<>::FastMathFlagGuard FMFG(Builder); +        Builder.setFastMathFlags(Cmp->getFastMathFlags());          C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); -        cast<FCmpInst>(C)->copyFastMathFlags(Cmp);        } else {          C = Builder.CreateICmp(Cmp->getPredicate(), A, B);        } @@ -4874,10 +4850,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {      break;    } -  case Instruction::Store: -  case Instruction::Load: -    vectorizeMemoryInstruction(&I); -    break;    case Instruction::ZExt:    case Instruction::SExt:    case Instruction::FPToUI: @@ -4893,16 +4865,6 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {      auto *CI = dyn_cast<CastInst>(&I);      setDebugLocFromInst(Builder, CI); -    // Optimize the special case where the source is a constant integer -    // induction variable. Notice that we can only optimize the 'trunc' case -    // because (a) FP conversions lose precision, (b) sext/zext may wrap, and -    // (c) other casts depend on pointer size. -    if (Cost->isOptimizableIVTruncate(CI, VF)) { -      widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)), -                            cast<TruncInst>(CI)); -      break; -    } -      /// Vectorize casts.      Type *DestTy =          (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4933,11 +4895,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {        Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); -    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || -               ID == Intrinsic::lifetime_start)) { -      scalarizeInstruction(&I); -      break; -    } +      // The flag shows whether we use Intrinsic or a usual Call for vectorized      // version of the instruction.      // Is it beneficial to perform intrinsic call compared to lib call? @@ -4945,10 +4903,8 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);      bool UseVectorIntrinsic =          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; -    if (!UseVectorIntrinsic && NeedToScalarize) { -      scalarizeInstruction(&I); -      break; -    } +    assert((UseVectorIntrinsic || !NeedToScalarize) && +           "Instruction should be scalarized elsewhere.");      for (unsigned Part = 0; Part < UF; ++Part) {        SmallVector<Value *, 4> Args; @@ -4998,9 +4954,9 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {    }    default: -    // All other instructions are unsupported. Scalarize them. -    scalarizeInstruction(&I); -    break; +    // This instruction is not vectorized by simple widening. +    DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); +    llvm_unreachable("Unhandled instruction!");    } // end of switch.  
} @@ -5012,14 +4968,11 @@ void InnerLoopVectorizer::updateAnalysis() {    assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&           "Entry does not dominate exit."); -  DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), -                  LoopVectorPreHeader);    DT->addNewBlock(LoopMiddleBlock,                    LI->getLoopFor(LoopVectorBody)->getLoopLatch());    DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);    DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);    DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); -    DEBUG(DT->verifyDomTree());  } @@ -5094,12 +5047,15 @@ bool LoopVectorizationLegality::canVectorize() {    // Store the result and return it at the end instead of exiting early, in case    // allowExtraAnalysis is used to report multiple reasons for not vectorizing.    bool Result = true; + +  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);  // We must have a loop in canonical form. Loops with indirectbr in them cannot  // be canonicalized.  if (!TheLoop->getLoopPreheader()) {      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5112,7 +5068,7 @@ bool LoopVectorizationLegality::canVectorize() {    if (!TheLoop->empty()) {      ORE->emit(createMissedAnalysis("NotInnermostLoop")                << "loop is not the innermost loop"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5122,7 +5078,7 @@ bool LoopVectorizationLegality::canVectorize() {    if (TheLoop->getNumBackEdges() != 1) {      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5132,7 +5088,7 @@ bool LoopVectorizationLegality::canVectorize() {    if (!TheLoop->getExitingBlock()) {      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5144,7 +5100,7 @@ bool LoopVectorizationLegality::canVectorize() {    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {      ORE->emit(createMissedAnalysis("CFGNotUnderstood")                << "loop control flow is not understood by vectorizer"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5158,7 +5114,7 @@ bool LoopVectorizationLegality::canVectorize() {    unsigned NumBlocks = TheLoop->getNumBlocks();    if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {      DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5167,7 +5123,7 @@    // Check if we can vectorize the instructions and CFG in this loop.    
if (!canVectorizeInstrs()) {      DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5176,7 +5132,7 @@ bool LoopVectorizationLegality::canVectorize() {    // Go over each instruction and look at memory deps.    if (!canVectorizeMemory()) {      DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5207,7 +5163,7 @@ bool LoopVectorizationLegality::canVectorize() {                << "Too many SCEV assumptions need to be made and checked "                << "at runtime");      DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); -    if (ORE->allowExtraAnalysis()) +    if (DoExtraAnalysis)        Result = false;      else        return false; @@ -5263,6 +5219,15 @@ void LoopVectorizationLegality::addInductionPhi(      PHINode *Phi, const InductionDescriptor &ID,      SmallPtrSetImpl<Value *> &AllowedExit) {    Inductions[Phi] = ID; + +  // In case this induction also comes with casts that we know we can ignore +  // in the vectorized loop body, record them here. All casts could be recorded +  // here for ignoring, but it suffices to record only the first (as it is the +  // only one that may be used outside the cast sequence). +  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); +  if (!Casts.empty()) +    InductionCastsToIgnore.insert(*Casts.begin()); +    Type *PhiTy = Phi->getType();    const DataLayout &DL = Phi->getModule()->getDataLayout(); @@ -5300,7 +5265,6 @@ void LoopVectorizationLegality::addInductionPhi(    }    DEBUG(dbgs() << "LV: Found an induction variable.\n"); -  return;  }  bool LoopVectorizationLegality::canVectorizeInstrs() { @@ -5439,7 +5403,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {          // operations, shuffles, or casts, as they don't change precision or          // semantics.        } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && -                 !I.hasUnsafeAlgebra()) { +                 !I.isFast()) {          DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");          Hints->setPotentiallyUnsafe();        } @@ -5451,7 +5415,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {                    << "value cannot be used outside the loop");          return false;        } -      } // next instr.    } @@ -5474,7 +5437,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {  }  void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { -    // We should not collect Scalars more than once per VF. Right now, this    // function is called from collectUniformsAndScalars(), which already does    // this check. Collecting Scalars for VF=1 does not make any sense. @@ -5517,7 +5479,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {    // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in    // PossibleNonScalarPtrs.    auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { -      // We only care about bitcast and getelementptr instructions contained in      // the loop.      if (!isLoopVaryingBitCastOrGEP(Ptr)) @@ -5532,7 +5493,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {      // If the use of the pointer will be a scalar use, and all users of the      // pointer are memory accesses, place the pointer in ScalarPtrs. 
Otherwise,      // place the pointer in PossibleNonScalarPtrs. -    if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) { +    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {            return isa<LoadInst>(U) || isa<StoreInst>(U);          }))        ScalarPtrs.insert(I); @@ -5604,7 +5565,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {      if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))        continue;      auto *Src = cast<Instruction>(Dst->getOperand(0)); -    if (all_of(Src->users(), [&](User *U) -> bool { +    if (llvm::all_of(Src->users(), [&](User *U) -> bool {            auto *J = cast<Instruction>(U);            return !TheLoop->contains(J) || Worklist.count(J) ||                   ((isa<LoadInst>(J) || isa<StoreInst>(J)) && @@ -5631,7 +5592,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {      // Determine if all users of the induction variable are scalar after      // vectorization. -    auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { +    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {        auto *I = cast<Instruction>(U);        return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);      }); @@ -5640,10 +5601,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {      // Determine if all users of the induction variable update instruction are      // scalar after vectorization. -    auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { -      auto *I = cast<Instruction>(U); -      return I == Ind || !TheLoop->contains(I) || Worklist.count(I); -    }); +    auto ScalarIndUpdate = +        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { +          auto *I = cast<Instruction>(U); +          return I == Ind || !TheLoop->contains(I) || Worklist.count(I); +        });      if (!ScalarIndUpdate)        continue; @@ -5703,7 +5665,6 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,  }  void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { -    // We should not collect Uniforms more than once per VF. Right now,    // this function is called from collectUniformsAndScalars(), which    // already does this check. Collecting Uniforms for VF=1 does not make any @@ -5754,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {             "Widening decision should be ready at this moment");      return (WideningDecision == CM_Widen || +            WideningDecision == CM_Widen_Reverse ||              WideningDecision == CM_Interleave);    };    // Iterate over the instructions in the loop, and collect all @@ -5766,7 +5728,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {    // the getelementptr won't remain uniform.    for (auto *BB : TheLoop->blocks())      for (auto &I : *BB) { -        // If there's no pointer operand, there's nothing to do.        auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));        if (!Ptr) @@ -5774,9 +5735,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {        // True if all users of Ptr are memory accesses that have Ptr as their        // pointer operand. 
-      auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool { -        return getPointerOperand(U) == Ptr; -      }); +      auto UsersAreMemAccesses = +          llvm::all_of(Ptr->users(), [&](User *U) -> bool { +            return getPointerOperand(U) == Ptr; +          });        // Ensure the memory instruction will not be scalarized or used by        // gather/scatter, making its pointer operand non-uniform. If the pointer @@ -5812,7 +5774,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {        if (isOutOfScope(OV))          continue;        auto *OI = cast<Instruction>(OV); -      if (all_of(OI->users(), [&](User *U) -> bool { +      if (llvm::all_of(OI->users(), [&](User *U) -> bool {              auto *J = cast<Instruction>(U);              return !TheLoop->contains(J) || Worklist.count(J) ||                     (OI == getPointerOperand(J) && isUniformDecision(J, VF)); @@ -5841,7 +5803,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {      // Determine if all users of the induction variable are uniform after      // vectorization. -    auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool { +    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {        auto *I = cast<Instruction>(U);        return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||               isVectorizedMemAccessUse(I, Ind); @@ -5851,11 +5813,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {      // Determine if all users of the induction variable update instruction are      // uniform after vectorization. -    auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { -      auto *I = cast<Instruction>(U); -      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || -             isVectorizedMemAccessUse(I, IndUpdate); -    }); +    auto UniformIndUpdate = +        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { +          auto *I = cast<Instruction>(U); +          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || +                 isVectorizedMemAccessUse(I, IndUpdate); +        });      if (!UniformIndUpdate)        continue; @@ -5874,9 +5837,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {    InterleaveInfo.setLAI(LAI);    const OptimizationRemarkAnalysis *LAR = LAI->getReport();    if (LAR) { -    OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(), -                                  "loop not vectorized: ", *LAR); -    ORE->emit(VR); +    ORE->emit([&]() { +      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), +                                        "loop not vectorized: ", *LAR); +    });    }    if (!LAI->canVectorizeMemory())      return false; @@ -5894,7 +5858,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {    return true;  } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) {    Value *In0 = const_cast<Value *>(V);    PHINode *PN = dyn_cast_or_null<PHINode>(In0);    if (!PN) @@ -5903,6 +5867,15 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) {    return Inductions.count(PN);  } +bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +  auto *Inst = dyn_cast<Instruction>(V); +  return (Inst && InductionCastsToIgnore.count(Inst)); +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +  return isInductionPhi(V) || 
isCastedInductionVariable(V); +} +  bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {    return FirstOrderRecurrences.count(Phi);  } @@ -5972,7 +5945,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(  void InterleavedAccessInfo::collectConstStrideAccesses(      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,      const ValueToValueMap &Strides) { -    auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();    // Since it's desired that the load/store instructions be maintained in @@ -6126,7 +6098,6 @@ void InterleavedAccessInfo::analyzeInterleaving(        // but not with (4). If we did, the dependent access (3) would be within        // the boundaries of the (2, 4) group.        if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) { -          // If a dependence exists and A is already in a group, we know that A          // must be a store since A precedes B and WAR dependences are allowed.          // Thus, A would be sunk below B. We release A's group to prevent this @@ -6205,9 +6176,11 @@ void InterleavedAccessInfo::analyzeInterleaving(    // Remove interleaved store groups with gaps.    for (InterleaveGroup *Group : StoreGroups) -    if (Group->getNumMembers() != Group->getFactor()) +    if (Group->getNumMembers() != Group->getFactor()) { +      DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due " +                      "to gaps.\n");        releaseGroup(Group); - +    }    // Remove interleaved groups with gaps (currently only loads) whose memory    // accesses may wrap around. We have to revisit the getPtrStride analysis,    // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does @@ -6222,9 +6195,7 @@ void InterleavedAccessInfo::analyzeInterleaving(    // This means that we can forcefully peel the loop in order to only have to    // check the first pointer for no-wrap. When we'll change to use Assume=true    // we'll only need at most one runtime check per interleaved group. -  //    for (InterleaveGroup *Group : LoadGroups) { -      // Case 1: A full group. Can Skip the checks; For full groups, if the wide      // load would wrap around the address space we would do a memory access at      // nullptr even without the transformation. @@ -6260,6 +6231,8 @@ void InterleavedAccessInfo::analyzeInterleaving(        // to look for a member at index factor - 1, since every group must have        // a member at index zero.        if (Group->isReverse()) { +        DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " +                        "a reverse access with gaps.\n");          releaseGroup(Group);          continue;        } @@ -6277,8 +6250,21 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {      return None;    } +  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { +    // TODO: It may be useful to do this, since the check is still likely to be +    // dynamically uniform if the target can skip it. +    DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target\n"); + +    ORE->emit( +      createMissedAnalysis("CantVersionLoopWithDivergentTarget") +      << "runtime pointer checks needed. Not enabled for divergent target"); + +    return None; +  } + +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);    if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. 
-    return computeFeasibleMaxVF(OptForSize); +    return computeFeasibleMaxVF(OptForSize, TC);    if (Legal->getRuntimePointerChecking()->Need) {      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -6291,7 +6277,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {    }    // If we optimize the program for size, avoid creating the tail loop. -  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');    // If we don't know the precise trip count, don't try to vectorize. @@ -6303,7 +6288,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {      return None;    } -  unsigned MaxVF = computeFeasibleMaxVF(OptForSize); +  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);    if (TC % MaxVF != 0) {      // If the trip count that we found modulo the vectorization factor is not @@ -6324,46 +6309,52 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {    return MaxVF;  } -unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) { +unsigned +LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, +                                                 unsigned ConstTripCount) {    MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);    unsigned SmallestType, WidestType;    std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();    unsigned WidestRegister = TTI.getRegisterBitWidth(true); -  unsigned MaxSafeDepDist = -1U; -  // Get the maximum safe dependence distance in bits computed by LAA. If the -  // loop contains any interleaved accesses, we divide the dependence distance -  // by the maximum interleave factor of all interleaved groups. Note that -  // although the division ensures correctness, this is a fairly conservative -  // computation because the maximum distance computed by LAA may not involve -  // any of the interleaved accesses. -  if (Legal->getMaxSafeDepDistBytes() != -1U) -    MaxSafeDepDist = -        Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor(); +  // Get the maximum safe dependence distance in bits computed by LAA. +  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from +  // the memory access that is most restrictive (involved in the smallest +  // dependence distance). +  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); + +  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); -  WidestRegister =      ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);    unsigned MaxVectorSize = WidestRegister / WidestType;    DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "                 << WidestType << " bits.\n"); -  DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister +  DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister                 << " bits.\n"); +  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"                                " into one vector!");    if (MaxVectorSize == 0) {      DEBUG(dbgs() << "LV: The target has no vector registers.\n");      MaxVectorSize = 1; +    return MaxVectorSize; +  } else if (ConstTripCount && ConstTripCount < MaxVectorSize && +             isPowerOf2_32(ConstTripCount)) { +    // We need to clamp the VF to be the ConstTripCount. There is no point in +    // choosing a higher viable VF as done in the loop below. 
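+    // Illustrative numbers (editorial, not from the patch): with i32 elements
+    // and a 256-bit widest register, MaxVectorSize is 8; a constant trip
+    // count of 4 (a power of 2) clamps the VF to 4, so the vector loop can
+    // actually be entered and runs exactly once, whereas a VF of 8 could
+    // never execute.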
+    DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " +                 << ConstTripCount << "\n"); +    MaxVectorSize = ConstTripCount; +    return MaxVectorSize;    } -  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" -                                " into one vector!"); -    unsigned MaxVF = MaxVectorSize;    if (MaximizeBandwidth && !OptForSize) { -    // Collect all viable vectorization factors. +    // Collect all viable vectorization factors larger than the default MaxVF +    // (i.e. MaxVectorSize).      SmallVector<unsigned, 8> VFs;      unsigned NewMaxVectorSize = WidestRegister / SmallestType; -    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) +    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)        VFs.push_back(VS);      // For each VF calculate its register usage. @@ -6485,7 +6476,6 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {  unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,                                                             unsigned VF,                                                             unsigned LoopCost) { -    // -- The interleave heuristics --    // We interleave the loop in order to expose ILP and reduce the loop overhead.    // There are many micro-architectural considerations that we can't predict @@ -6573,7 +6563,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    // Interleave if we vectorized this loop and there is a reduction that could    // benefit from interleaving. -  if (VF > 1 && Legal->getReductionVars()->size()) { +  if (VF > 1 && !Legal->getReductionVars()->empty()) {      DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");      return IC;    } @@ -6604,7 +6594,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,      // by this point), we can increase the critical path length if the loop      // we're interleaving is inside another loop. Limit, by default to 2, so the      // critical path only gets increased by one reduction operation. -    if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) { +    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {        unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);        SmallIC = std::min(SmallIC, F);        StoresIC = std::min(StoresIC, F); @@ -6623,7 +6613,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    // Interleave if this is a large loop (small loops are already dealt with by    // this point) that could benefit from interleaving. -  bool HasReductions = (Legal->getReductionVars()->size() > 0); +  bool HasReductions = !Legal->getReductionVars()->empty();    if (TTI.enableAggressiveInterleaving(HasReductions)) {      DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");      return IC; @@ -6661,7 +6651,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {    // Each 'key' in the map opens a new interval. The values    // of the map are the index of the 'last seen' usage of the    // instruction that is the key. -  typedef DenseMap<Instruction *, unsigned> IntervalMap; +  using IntervalMap = DenseMap<Instruction *, unsigned>; +    // Maps instruction to its index.    DenseMap<unsigned, Instruction *> IdxToInstr;    // Marks the end of each interval. 
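(Editorial aside on the interval bookkeeping above; the sketch below restates the transpose step with illustrative comments, assuming the surrounding EndPoint map from each instruction to the index of its last use.)

    // Invert "instruction -> index of its last use" into
    // "index -> instructions whose interval ends there", so that when the
    // scan reaches index i it can retire all intervals closing at i.
    DenseMap<unsigned, SmallVector<Instruction *, 2>> TransposeEnds;
    for (auto &Interval : EndPoint)
      TransposeEnds[Interval.second].push_back(Interval.first);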
@@ -6700,7 +6691,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {    }    // Saves the list of intervals that end with the index in 'key'. -  typedef SmallVector<Instruction *, 2> InstrList; +  using InstrList = SmallVector<Instruction *, 2>;    DenseMap<unsigned, InstrList> TransposeEnds;    // Transpose the EndPoints to a list of values that end at each index. @@ -6795,7 +6786,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {  }  void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { -    // If we aren't vectorizing the loop, or if we've already collected the    // instructions to scalarize, there's nothing to do. Collection may already    // have occurred if we have a user-selected VF and are now computing the @@ -6829,7 +6819,6 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {  int LoopVectorizationCostModel::computePredInstDiscount(      Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,      unsigned VF) { -    assert(!isUniformAfterVectorization(PredInst, VF) &&           "Instruction marked uniform-after-vectorization will be predicated"); @@ -6844,7 +6833,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(    // Returns true if the given instruction can be scalarized.    auto canBeScalarized = [&](Instruction *I) -> bool { -      // We only attempt to scalarize instructions forming a single-use chain      // from the original predicated block that would otherwise be vectorized.      // Although not strictly necessary, we give up on instructions we know will @@ -6947,13 +6935,6 @@ LoopVectorizationCostModel::VectorizationCostTy  LoopVectorizationCostModel::expectedCost(unsigned VF) {    VectorizationCostTy Cost; -  // Collect Uniform and Scalar instructions after vectorization with VF. -  collectUniformsAndScalars(VF); - -  // Collect the instructions (and their associated costs) that will be more -  // profitable to scalarize. -  collectInstsToScalarize(VF); -    // For each block.    for (BasicBlock *BB : TheLoop->blocks()) {      VectorizationCostTy BlockCost; @@ -6965,7 +6946,8 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {          continue;        // Skip ignored values. -      if (ValuesToIgnore.count(&I)) +      if (ValuesToIgnore.count(&I) || +          (VF > 1 && VecValuesToIgnore.count(&I)))          continue;        VectorizationCostTy C = getInstructionCost(&I, VF); @@ -7004,14 +6986,16 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {  static const SCEV *getAddressAccessSCEV(                Value *Ptr,                LoopVectorizationLegality *Legal, -              ScalarEvolution *SE, +              PredicatedScalarEvolution &PSE,                const Loop *TheLoop) { +    auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);    if (!Gep)      return nullptr;    // We are looking for a gep with all loop invariant indices except for one    // which should be an induction variable. +  auto SE = PSE.getSE();    unsigned NumOperands = Gep->getNumOperands();    for (unsigned i = 1; i < NumOperands; ++i) {      Value *Opd = Gep->getOperand(i); @@ -7021,7 +7005,7 @@ static const SCEV *getAddressAccessSCEV(    }    // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
-  return SE->getSCEV(Ptr); +  return PSE.getSCEV(Ptr);  }  static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -7041,7 +7025,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,    // Figure out whether the access is strided and get the stride value    // if it's known in compile time -  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); +  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);    // Get the cost of the scalar memory instruction and address computation.    unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); @@ -7145,7 +7129,6 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,  unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,                                                                unsigned VF) { -    // Calculate scalar cost only. Vectorization cost should be ready at this    // moment.    if (VF == 1) { @@ -7202,12 +7185,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {        // We assume that widening is the best solution when possible.        if (Legal->memoryInstructionCanBeWidened(&I, VF)) {          unsigned Cost = getConsecutiveMemOpCost(&I, VF); -        setWideningDecision(&I, VF, CM_Widen, Cost); +        int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I)); +        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && +               "Expected consecutive stride."); +        InstWidening Decision = +            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; +        setWideningDecision(&I, VF, Decision, Cost);          continue;        }        // Choose between Interleaving, Gather/Scatter or Scalarization. -      unsigned InterleaveCost = UINT_MAX; +      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();        unsigned NumAccesses = 1;        if (Legal->isAccessInterleaved(&I)) {          auto Group = Legal->getInterleavedAccessGroup(&I); @@ -7224,7 +7212,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {        unsigned GatherScatterCost =            Legal->isLegalGatherOrScatter(&I)                ? getGatherScatterCost(&I, VF) * NumAccesses -              : UINT_MAX; +              : std::numeric_limits<unsigned>::max();        unsigned ScalarizationCost =            getMemInstScalarizationCost(&I, VF) * NumAccesses; @@ -7282,7 +7270,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {      for (auto &Op : I->operands())        if (auto *InstOp = dyn_cast<Instruction>(Op))          if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && -            AddrDefs.insert(InstOp).second == true) +            AddrDefs.insert(InstOp).second)            Worklist.push_back(InstOp);    } @@ -7292,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {        // by cost functions, but since this involves the task of finding out        // if the loaded register is involved in an address computation, it is        // instead changed here when we know this is the case. -      if (getWideningDecision(I, VF) == CM_Widen) +      InstWidening Decision = getWideningDecision(I, VF); +      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)          // Scalarize a widened load of address.          
setWideningDecision(I, VF, CM_Scalarize,                              (VF * getMemoryInstructionCost(I, 1))); @@ -7551,7 +7540,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,  }  char LoopVectorize::ID = 0; +  static const char lv_name[] = "Loop Vectorization"; +  INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) @@ -7568,13 +7559,14 @@ INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)  INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)  namespace llvm { +  Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {    return new LoopVectorize(NoUnrolling, AlwaysVectorize);  } -} -bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { +} // end namespace llvm +bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {    // Check if the pointer operand of a load or store instruction is    // consecutive.    if (auto *Ptr = getPointerOperand(Inst)) @@ -7593,11 +7585,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {      SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();      VecValuesToIgnore.insert(Casts.begin(), Casts.end());    } +  // Ignore type-casting instructions we identified during induction +  // detection. +  for (auto &Induction : *Legal->getInductionVars()) { +    InductionDescriptor &IndDes = Induction.second; +    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); +    VecValuesToIgnore.insert(Casts.begin(), Casts.end()); +  }  }  LoopVectorizationCostModel::VectorizationFactor  LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { -    // Width 1 means no vectorize, cost 0 means uncomputed cost.    const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,                                                                             0U}; @@ -7611,11 +7609,26 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {      // Collect the instructions (and their associated costs) that will be more      // profitable to scalarize.      CM.selectUserVectorizationFactor(UserVF); +    buildVPlans(UserVF, UserVF); +    DEBUG(printPlans(dbgs()));      return {UserVF, 0};    }    unsigned MaxVF = MaybeMaxVF.getValue();    assert(MaxVF != 0 && "MaxVF is zero."); + +  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { +    // Collect Uniform and Scalar instructions after vectorization with VF. +    CM.collectUniformsAndScalars(VF); + +    // Collect the instructions (and their associated costs) that will be more +    // profitable to scalarize. 
+    if (VF > 1) +      CM.collectInstsToScalarize(VF); +  } + +  buildVPlans(1, MaxVF); +  DEBUG(printPlans(dbgs()));    if (MaxVF == 1)      return NoVectorization; @@ -7623,11 +7636,28 @@    return CM.selectVectorizationFactor(MaxVF);  } -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { +void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { +  DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); +  BestVF = VF; +  BestUF = UF; + +  erase_if(VPlans, [VF](const VPlanPtr &Plan) { +    return !Plan->hasVF(VF); +  }); +  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan."); +} + +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, +                                           DominatorTree *DT) {    // Perform the actual loop transformation.    // 1. Create a new empty loop. Unlink the old loop and connect the new one. -  ILV.createVectorizedLoopSkeleton(); +  VPCallbackILV CallbackILV(ILV); + +  VPTransformState State{BestVF, BestUF,      LI, +                         DT,     ILV.Builder, ILV.VectorLoopValueMap, +                         &ILV,   CallbackILV}; +  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();    //===------------------------------------------------===//    // @@ -7638,36 +7668,8 @@    // 2. Copy and widen instructions from the old loop into the new loop. - -  // Move instructions to handle first-order recurrences. -  DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter(); -  for (auto &Entry : SinkAfter) { -    Entry.first->removeFromParent(); -    Entry.first->insertAfter(Entry.second); -    DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second -                 << " to vectorize a 1st order recurrence.\n"); -  } - -  // Collect instructions from the original loop that will become trivially dead -  // in the vectorized loop. We don't need to vectorize these instructions. For -  // example, original induction update instructions can become dead because we -  // separately emit induction "steps" when generating code for the new loop. -  // Similarly, we create a new latch condition when setting up the structure -  // of the new loop, so the old one can become dead. -  SmallPtrSet<Instruction *, 4> DeadInstructions; -  collectTriviallyDeadInstructions(DeadInstructions); - -  // Scan the loop in a topological order to ensure that defs are vectorized -  // before users. -  LoopBlocksDFS DFS(OrigLoop); -  DFS.perform(LI); - -  // Vectorize all instructions in the original loop that will not become -  // trivially dead when vectorized. -  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) -    for (Instruction &I : *BB) -      if (!DeadInstructions.count(&I)) -        ILV.vectorizeInstruction(I); +  assert(VPlans.size() == 1 && "Not a single VPlan to execute."); +  VPlans.front()->execute(&State);    // 3. Fix the vectorized code: take care of header phi's, live-outs,    //    predication, updating analyses. 
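(Editorial sketch, not part of the patch: how the two new planner entry points above are expected to be driven. The caller shape below is an assumption based on the surrounding code; the variable names LVP, LB, VF and IC are illustrative.)

    // Hypothetical caller: pick the best VF/UF, prune to the single matching
    // VPlan, then execute it against a freshly created loop skeleton.
    LVP.setBestPlan(VF.Width, IC);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);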
@@ -7691,18 +7693,23 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(    for (auto &Induction : *Legal->getInductionVars()) {      PHINode *Ind = Induction.first;      auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); -    if (all_of(IndUpdate->users(), [&](User *U) -> bool { +    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {            return U == Ind || DeadInstructions.count(cast<Instruction>(U));          }))        DeadInstructions.insert(IndUpdate); -  } -} - -void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { -  auto *SI = dyn_cast<StoreInst>(Instr); -  bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); -  return scalarizeInstruction(Instr, IfPredicateInstr); +    // We also record as "Dead" the type-casting instructions we identified +    // during induction analysis. They need no handling in the vectorized loop: +    // we have proven that, under a proper runtime test guarding the vectorized +    // loop, the value of the phi and the casted value of the phi are the same. +    // The last instruction in this casting chain will get its scalar/vector/ +    // widened def from the scalar/vector/widened def of the respective phi +    // node. Any other casts in the induction def-use chain have no other uses +    // outside the phi update chain, and will be ignored. +    InductionDescriptor &IndDes = Induction.second; +    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); +    DeadInstructions.insert(Casts.begin(), Casts.end()); +  }  }  Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } @@ -7760,6 +7767,722 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {    }  } +bool LoopVectorizationPlanner::getDecisionAndClampRange( +    const std::function<bool(unsigned)> &Predicate, VFRange &Range) { +  assert(Range.End > Range.Start && "Trying to test an empty VF range."); +  bool PredicateAtRangeStart = Predicate(Range.Start); + +  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) +    if (Predicate(TmpVF) != PredicateAtRangeStart) { +      Range.End = TmpVF; +      break; +    } + +  return PredicateAtRangeStart; +} + +/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, +/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range +/// of VF's starting at a given VF and extending it as much as possible. Each +/// vectorization decision can potentially shorten this sub-range during +/// buildVPlan(). +void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { + +  // Collect conditions feeding internal conditional branches; they need to be +  // represented in VPlan for it to model masking. 
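+  // Editorial note: these conditions are the only values the mask recipes
+  // read from outside VPlan; giving each one a VPValue def (see the
+  // addVPValue calls in buildVPlan below) is what lets createEdgeMask()
+  // retrieve them via Plan->getVPValue() when negating and AND-ing masks.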
+  SmallPtrSet<Value *, 1> NeedDef; + +  auto *Latch = OrigLoop->getLoopLatch(); +  for (BasicBlock *BB : OrigLoop->blocks()) { +    if (BB == Latch) +      continue; +    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); +    if (Branch && Branch->isConditional()) +      NeedDef.insert(Branch->getCondition()); +  } + +  for (unsigned VF = MinVF; VF < MaxVF + 1;) { +    VFRange SubRange = {VF, MaxVF + 1}; +    VPlans.push_back(buildVPlan(SubRange, NeedDef)); +    VF = SubRange.End; +  } +} + +VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, +                                                  BasicBlock *Dst, +                                                  VPlanPtr &Plan) { +  assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); + +  // Look for cached value. +  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); +  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); +  if (ECEntryIt != EdgeMaskCache.end()) +    return ECEntryIt->second; + +  VPValue *SrcMask = createBlockInMask(Src, Plan); + +  // The terminator has to be a branch inst! +  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); +  assert(BI && "Unexpected terminator found"); + +  if (!BI->isConditional()) +    return EdgeMaskCache[Edge] = SrcMask; + +  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); +  assert(EdgeMask && "No Edge Mask found for condition"); + +  if (BI->getSuccessor(0) != Dst) +    EdgeMask = Builder.createNot(EdgeMask); + +  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. +    EdgeMask = Builder.createAnd(EdgeMask, SrcMask); + +  return EdgeMaskCache[Edge] = EdgeMask; +} + +VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, +                                                     VPlanPtr &Plan) { +  assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + +  // Look for cached value. +  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); +  if (BCEntryIt != BlockMaskCache.end()) +    return BCEntryIt->second; + +  // All-one mask is modelled as no-mask following the convention for masked +  // load/store/gather/scatter. Initialize BlockMask to no-mask. +  VPValue *BlockMask = nullptr; + +  // Loop incoming mask is all-one. +  if (OrigLoop->getHeader() == BB) +    return BlockMaskCache[BB] = BlockMask; + +  // This is the block mask. We OR all incoming edges. +  for (auto *Predecessor : predecessors(BB)) { +    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); +    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. +      return BlockMaskCache[BB] = EdgeMask; + +    if (!BlockMask) { // BlockMask has its initialized nullptr value. +      BlockMask = EdgeMask; +      continue; +    } + +    BlockMask = Builder.createOr(BlockMask, EdgeMask); +  } + +  return BlockMaskCache[BB] = BlockMask; +} + +VPInterleaveRecipe * +LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, +                                                VFRange &Range) { +  const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); +  if (!IG) +    return nullptr; + +  // Now check if IG is relevant for VF's in the given range. 
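+  // Worked example (editorial, illustrative values): with Range = {2, 16} and
+  // a cost model that chose CM_Interleave for VF = 2 and 4 but not for 8, the
+  // predicate below flips at VF = 8, so getDecisionAndClampRange() trims
+  // Range.End to 8 and returns true for the remaining sub-range {2, 4}.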
+  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { +    return [=](unsigned VF) -> bool { +      return (VF >= 2 && // Query is illegal for VF == 1 +              CM.getWideningDecision(I, VF) == +                  LoopVectorizationCostModel::CM_Interleave); +    }; +  }; +  if (!getDecisionAndClampRange(isIGMember(I), Range)) +    return nullptr; + +  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) +  // range. If it's the primary member of the IG, construct a VPInterleaveRecipe. +  // Otherwise it's an adjunct member of the IG; do not construct any Recipe. +  assert(I == IG->getInsertPos() && +         "Generating a recipe for an adjunct member of an interleave group"); + +  return new VPInterleaveRecipe(IG); +} + +VPWidenMemoryInstructionRecipe * +LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, +                                           VPlanPtr &Plan) { +  if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) +    return nullptr; + +  auto willWiden = [&](unsigned VF) -> bool { +    if (VF == 1) +      return false; +    if (CM.isScalarAfterVectorization(I, VF) || +        CM.isProfitableToScalarize(I, VF)) +      return false; +    LoopVectorizationCostModel::InstWidening Decision = +        CM.getWideningDecision(I, VF); +    assert(Decision != LoopVectorizationCostModel::CM_Unknown && +           "CM decision should be taken at this point."); +    assert(Decision != LoopVectorizationCostModel::CM_Interleave && +           "Interleave memory opportunity should be caught earlier."); +    return Decision != LoopVectorizationCostModel::CM_Scalarize; +  }; + +  if (!getDecisionAndClampRange(willWiden, Range)) +    return nullptr; + +  VPValue *Mask = nullptr; +  if (Legal->isMaskRequired(I)) +    Mask = createBlockInMask(I->getParent(), Plan); + +  return new VPWidenMemoryInstructionRecipe(*I, Mask); +} + +VPWidenIntOrFpInductionRecipe * +LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, +                                                 VFRange &Range) { +  if (PHINode *Phi = dyn_cast<PHINode>(I)) { +    // Check if this is an integer or fp induction. If so, build the recipe that +    // produces its scalar and vector values. +    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); +    if (II.getKind() == InductionDescriptor::IK_IntInduction || +        II.getKind() == InductionDescriptor::IK_FpInduction) +      return new VPWidenIntOrFpInductionRecipe(Phi); + +    return nullptr; +  } + +  // Optimize the special case where the source is a constant integer +  // induction variable. Notice that we can only optimize the 'trunc' case +  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and +  // (c) other casts depend on pointer size. + +  // Determine whether \p K is a truncation based on an induction variable that +  // can be optimized. 
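+  // Editorial example: given 'i64 %iv' stepping by 1 and a use
+  // 'trunc i64 %iv to i32', the cost model may declare the truncate
+  // optimizable, in which case the recipe built below widens the induction
+  // directly at the narrower i32 type instead of widening the cast.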
+  auto isOptimizableIVTruncate = +      [&](Instruction *K) -> std::function<bool(unsigned)> { +    return +        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; +  }; + +  if (isa<TruncInst>(I) && +      getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) +    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), +                                             cast<TruncInst>(I)); +  return nullptr; +} + +VPBlendRecipe * +LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { +  PHINode *Phi = dyn_cast<PHINode>(I); +  if (!Phi || Phi->getParent() == OrigLoop->getHeader()) +    return nullptr; + +  // We know that all PHIs in non-header blocks are converted into selects, so +  // we don't have to worry about the insertion order and we can just use the +  // builder. At this point we generate the predication tree. There may be +  // duplications since this is a simple recursive scan, but future +  // optimizations will clean it up. + +  SmallVector<VPValue *, 2> Masks; +  unsigned NumIncoming = Phi->getNumIncomingValues(); +  for (unsigned In = 0; In < NumIncoming; In++) { +    VPValue *EdgeMask = +      createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); +    assert((EdgeMask || NumIncoming == 1) && +           "Multiple predecessors with one having a full mask"); +    if (EdgeMask) +      Masks.push_back(EdgeMask); +  } +  return new VPBlendRecipe(Phi, Masks); +} + +bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, +                                          VFRange &Range) { +  if (Legal->isScalarWithPredication(I)) +    return false; + +  auto IsVectorizableOpcode = [](unsigned Opcode) { +    switch (Opcode) { +    case Instruction::Add: +    case Instruction::And: +    case Instruction::AShr: +    case Instruction::BitCast: +    case Instruction::Br: +    case Instruction::Call: +    case Instruction::FAdd: +    case Instruction::FCmp: +    case Instruction::FDiv: +    case Instruction::FMul: +    case Instruction::FPExt: +    case Instruction::FPToSI: +    case Instruction::FPToUI: +    case Instruction::FPTrunc: +    case Instruction::FRem: +    case Instruction::FSub: +    case Instruction::GetElementPtr: +    case Instruction::ICmp: +    case Instruction::IntToPtr: +    case Instruction::Load: +    case Instruction::LShr: +    case Instruction::Mul: +    case Instruction::Or: +    case Instruction::PHI: +    case Instruction::PtrToInt: +    case Instruction::SDiv: +    case Instruction::Select: +    case Instruction::SExt: +    case Instruction::Shl: +    case Instruction::SIToFP: +    case Instruction::SRem: +    case Instruction::Store: +    case Instruction::Sub: +    case Instruction::Trunc: +    case Instruction::UDiv: +    case Instruction::UIToFP: +    case Instruction::URem: +    case Instruction::Xor: +    case Instruction::ZExt: +      return true; +    } +    return false; +  }; + +  if (!IsVectorizableOpcode(I->getOpcode())) +    return false; + +  if (CallInst *CI = dyn_cast<CallInst>(I)) { +    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); +    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || +               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) +      return false; +  } + +  auto willWiden = [&](unsigned VF) -> bool { +    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || +                             CM.isProfitableToScalarize(I, VF))) +      return false; +    if (CallInst *CI = 
dyn_cast<CallInst>(I)) { +      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); +      // The following case may be scalarized depending on the VF. +      // The flag shows whether we use Intrinsic or a usual Call for vectorized +      // version of the instruction. +      // Is it beneficial to perform intrinsic call compared to lib call? +      bool NeedToScalarize; +      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); +      bool UseVectorIntrinsic = +          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; +      return UseVectorIntrinsic || !NeedToScalarize; +    } +    if (isa<LoadInst>(I) || isa<StoreInst>(I)) { +      assert(CM.getWideningDecision(I, VF) == +                 LoopVectorizationCostModel::CM_Scalarize && +             "Memory widening decisions should have been taken care by now"); +      return false; +    } +    return true; +  }; + +  if (!getDecisionAndClampRange(willWiden, Range)) +    return false; + +  // Success: widen this instruction. We optimize the common case where +  // consecutive instructions can be represented by a single recipe. +  if (!VPBB->empty()) { +    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); +    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) +      return true; +  } + +  VPBB->appendRecipe(new VPWidenRecipe(I)); +  return true; +} + +VPBasicBlock *LoopVectorizationPlanner::handleReplication( +    Instruction *I, VFRange &Range, VPBasicBlock *VPBB, +    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, +    VPlanPtr &Plan) { +  bool IsUniform = getDecisionAndClampRange( +      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, +      Range); + +  bool IsPredicated = Legal->isScalarWithPredication(I); +  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + +  // Find if I uses a predicated instruction. If so, it will use its scalar +  // value. Avoid hoisting the insert-element which packs the scalar value into +  // a vector value, as that happens iff all users use the vector value. +  for (auto &Op : I->operands()) +    if (auto *PredInst = dyn_cast<Instruction>(Op)) +      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) +        PredInst2Recipe[PredInst]->setAlsoPack(false); + +  // Finalize the recipe for Instr, first if it is not predicated. +  if (!IsPredicated) { +    DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); +    VPBB->appendRecipe(Recipe); +    return VPBB; +  } +  DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); +  assert(VPBB->getSuccessors().empty() && +         "VPBB has successors when handling predicated replication."); +  // Record predicated instructions for above packing optimizations. +  PredInst2Recipe[I] = Recipe; +  VPBlockBase *Region = +    VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan)); +  return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); +} + +VPRegionBlock * +LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, +                                                VPRecipeBase *PredRecipe, +                                                VPlanPtr &Plan) { +  // Instructions marked for predication are replicated and placed under an +  // if-then construct to prevent side-effects. + +  // Generate recipes to compute the block mask for this region. +  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); + +  // Build the triangular if-then region. 
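+  // Editorial sketch of the region built below:
+  //
+  //        [pred.X.entry]  (branch-on-mask)
+  //          /         \
+  //   [pred.X.if]       |
+  //          \         /
+  //       [pred.X.continue]  (phi of the predicated value)
+  //
+  // Entry gets two successors (Pred and Exit) and Pred falls through to Exit,
+  // matching the setTwoSuccessors()/setOneSuccessor() calls below.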
+  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); +  assert(Instr->getParent() && "Predicated instruction not in any basic block"); +  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); +  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); +  auto *PHIRecipe = +      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); +  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); +  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); +  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); + +  // Note: first set Entry as region entry and then connect successors starting +  // from it in order, to propagate the "parent" of each VPBasicBlock. +  Entry->setTwoSuccessors(Pred, Exit); +  Pred->setOneSuccessor(Exit); + +  return Region; +} + +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlan(VFRange &Range, +                                     const SmallPtrSetImpl<Value *> &NeedDef) { +  EdgeMaskCache.clear(); +  BlockMaskCache.clear(); +  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); +  DenseMap<Instruction *, Instruction *> SinkAfterInverse; + +  // Collect instructions from the original loop that will become trivially dead +  // in the vectorized loop. We don't need to vectorize these instructions. For +  // example, original induction update instructions can become dead because we +  // separately emit induction "steps" when generating code for the new loop. +  // Similarly, we create a new latch condition when setting up the structure +  // of the new loop, so the old one can become dead. +  SmallPtrSet<Instruction *, 4> DeadInstructions; +  collectTriviallyDeadInstructions(DeadInstructions); + +  // Hold a mapping from predicated instructions to their recipes, in order to +  // fix their AlsoPack behavior if a user is determined to replicate and use a +  // scalar instead of vector value. +  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + +  // Create a dummy pre-entry VPBasicBlock to start building the VPlan. +  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); +  auto Plan = llvm::make_unique<VPlan>(VPBB); + +  // Represent values that will have defs inside VPlan. +  for (Value *V : NeedDef) +    Plan->addVPValue(V); + +  // Scan the body of the loop in a topological order to visit each basic block +  // after having visited its predecessor basic blocks. +  LoopBlocksDFS DFS(OrigLoop); +  DFS.perform(LI); + +  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { +    // Relevant instructions from basic block BB will be grouped into VPRecipe +    // ingredients and fill a new VPBasicBlock. +    unsigned VPBBsForBB = 0; +    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); +    VPBB->setOneSuccessor(FirstVPBBForBB); +    VPBB = FirstVPBBForBB; +    Builder.setInsertPoint(VPBB); + +    std::vector<Instruction *> Ingredients; + +    // Organize the ingredients to vectorize from current basic block in the +    // right order. +    for (Instruction &I : *BB) { +      Instruction *Instr = &I; + +      // First filter out irrelevant instructions, to ensure no recipes are +      // built for them. +      if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || +          DeadInstructions.count(Instr)) +        continue; + +      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct +      // member of the IG, do not construct any Recipe for it. 
+      const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); +      if (IG && Instr != IG->getInsertPos() && +          Range.Start >= 2 && // Query is illegal for VF == 1 +          CM.getWideningDecision(Instr, Range.Start) == +              LoopVectorizationCostModel::CM_Interleave) { +        if (SinkAfterInverse.count(Instr)) +          Ingredients.push_back(SinkAfterInverse.find(Instr)->second); +        continue; +      } + +      // Move instructions to handle first-order recurrences, step 1: avoid +      // handling this instruction until after we've handled the instruction it +      // should follow. +      auto SAIt = SinkAfter.find(Instr); +      if (SAIt != SinkAfter.end()) { +        DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second +                     << " to vectorize a 1st order recurrence.\n"); +        SinkAfterInverse[SAIt->second] = Instr; +        continue; +      } + +      Ingredients.push_back(Instr); + +      // Move instructions to handle first-order recurrences, step 2: push the +      // instruction to be sunk at its insertion point. +      auto SAInvIt = SinkAfterInverse.find(Instr); +      if (SAInvIt != SinkAfterInverse.end()) +        Ingredients.push_back(SAInvIt->second); +    } + +    // Introduce each ingredient into VPlan. +    for (Instruction *Instr : Ingredients) { +      VPRecipeBase *Recipe = nullptr; + +      // Check if Instr should belong to an interleave memory recipe, or already +      // does. In the latter case Instr is irrelevant. +      if ((Recipe = tryToInterleaveMemory(Instr, Range))) { +        VPBB->appendRecipe(Recipe); +        continue; +      } + +      // Check if Instr is a memory operation that should be widened. +      if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { +        VPBB->appendRecipe(Recipe); +        continue; +      } + +      // Check if Instr should form some PHI recipe. +      if ((Recipe = tryToOptimizeInduction(Instr, Range))) { +        VPBB->appendRecipe(Recipe); +        continue; +      } +      if ((Recipe = tryToBlend(Instr, Plan))) { +        VPBB->appendRecipe(Recipe); +        continue; +      } +      if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { +        VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); +        continue; +      } + +      // Check if Instr is to be widened by a general VPWidenRecipe, after +      // having first checked for specific widening recipes that deal with +      // Interleave Groups, Inductions and Phi nodes. +      if (tryToWiden(Instr, VPBB, Range)) +        continue; + +      // Otherwise, if all widening options failed, Instruction is to be +      // replicated. This may create a successor for VPBB. +      VPBasicBlock *NextVPBB = +        handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan); +      if (NextVPBB != VPBB) { +        VPBB = NextVPBB; +        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) +                                    : ""); +      } +    } +  } + +  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks +  // may also be empty, such as the last one VPBB, reflecting original +  // basic-blocks with no recipes. 
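+  // Illustration (an assumed example, not from this code): for a single-block
+  // loop body built with Range = [4,16), "Pre-Entry -> body" becomes just
+  // "body", and the plan assembled below is named
+  // "Initial VPlan for VF={4,8},UF>=1".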
+  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+  PreEntry->disconnectSuccessor(Entry);
+  delete PreEntry;
+
+  std::string PlanName;
+  raw_string_ostream RSO(PlanName);
+  unsigned VF = Range.Start;
+  Plan->addVF(VF);
+  RSO << "Initial VPlan for VF={" << VF;
+  for (VF *= 2; VF < Range.End; VF *= 2) {
+    Plan->addVF(VF);
+    RSO << "," << VF;
+  }
+  RSO << "},UF>=1";
+  RSO.flush();
+  Plan->setName(PlanName);
+
+  return Plan;
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+  O << " +\n"
+    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+  IG->getInsertPos()->printAsOperand(O, false);
+  O << "\\l\"";
+  for (unsigned i = 0; i < IG->getFactor(); ++i)
+    if (Instruction *I = IG->getMember(i))
+      O << " +\n"
+        << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
+  for (auto &Instr : make_range(Begin, End))
+    State.ILV->widenInstruction(Instr);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "Int or FP induction being replicated.");
+  State.ILV->widenIntOrFpInduction(IV, Trunc);
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
+  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+  State.ILV->setDebugLocFromInst(State.Builder, Phi);
+  // We know that all PHIs in non-header blocks are converted into
+  // selects, so we don't have to worry about the insertion order and we
+  // can just use the builder.
+  // At this point we generate the predication tree. There may be
+  // duplications since this is a simple recursive scan, but future
+  // optimizations will clean it up.
+
+  unsigned NumIncoming = Phi->getNumIncomingValues();
+
+  assert((User || NumIncoming == 1) &&
+         "Multiple predecessors with one having a full mask");
+  // Generate a sequence of selects of the form:
+  // SELECT(Mask3, In3,
+  //      SELECT(Mask2, In2,
+  //                   ( ...)))
+  InnerLoopVectorizer::VectorParts Entry(State.UF);
+  for (unsigned In = 0; In < NumIncoming; ++In) {
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      // We might have single-edge PHIs (blocks); use an identity
+      // 'select' for the first PHI operand.
+      Value *In0 =
+          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+      if (In == 0)
+        Entry[Part] = In0; // Initialize with the first incoming value.
+      else {
+        // Select between the current value and the previous incoming edge
+        // based on the incoming mask.
+        Value *Cond = State.get(User->getOperand(In), Part);
+        Entry[Part] =
+            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+      }
+    }
+  }
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "Interleave group being replicated.");
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+  if (State.Instance) { // Generate a single instance.
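+    // State.Instance identifies a single (Part, Lane) pair: replicate regions
+    // are code-generated one scalar instance at a time. The shape assumed
+    // here, for illustration:
+    //   struct Instance { unsigned Part; unsigned Lane; };
+    //   Optional<Instance> VPTransformState::Instance;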
+    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+    // Insert the scalar instance, packing it into a vector.
+    if (AlsoPack && State.VF > 1) {
+      // If we're constructing lane 0, initialize to start from undef.
+      if (State.Instance->Lane == 0) {
+        Value *Undef =
+            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+      }
+      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+    }
+    return;
+  }
+
+  // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for
+  // each of the UF parts.
+  unsigned EndLane = IsUniform ? 1 : State.VF;
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+  assert(State.Instance && "Branch on Mask works only on a single instance.");
+
+  unsigned Part = State.Instance->Part;
+  unsigned Lane = State.Instance->Lane;
+
+  Value *ConditionBit = nullptr;
+  if (!User) // The block-in mask is all-one.
+    ConditionBit = State.Builder.getTrue();
+  else {
+    VPValue *BlockInMask = User->getOperand(0);
+    ConditionBit = State.get(BlockInMask, Part);
+    if (ConditionBit->getType()->isVectorTy())
+      ConditionBit = State.Builder.CreateExtractElement(
+          ConditionBit, State.Builder.getInt32(Lane));
+  }
+
+  // Replace the temporary unreachable terminator with a new conditional
+  // branch, whose two destinations will be set later when they are created.
+  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+  assert(isa<UnreachableInst>(CurrentTerminator) &&
+         "Expected to replace unreachable terminator with conditional branch.");
+  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+  CondBr->setSuccessor(0, nullptr);
+  ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+  assert(State.Instance && "Predicated instruction PHI works per instance.");
+  Instruction *ScalarPredInst = cast<Instruction>(
+      State.ValueMap.getScalarValue(PredInst, *State.Instance));
+  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+  assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+  // By the current pack/unpack logic we need to generate only a single phi
+  // node: if a vector value for the predicated instruction exists at this
+  // point, it means the instruction has vector users only, and a phi for the
+  // vector value is needed. In this case the recipe of the predicated
+  // instruction is marked to also do that packing, thereby "hoisting" the
+  // insert-element sequence. Otherwise, a phi node for the scalar value is
+  // needed.
+  unsigned Part = State.Instance->Part;
+  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
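+    // Illustration (assumed IR): the insert-element performed in the
+    // predicated block is merged back through this vector phi:
+    //   pred.if:
+    //     %v.new = insertelement <4 x i32> %v, i32 %s, i32 %lane
+    //   pred.continue:
+    //     %v.phi = phi <4 x i32> [ %v, %pred.entry ], [ %v.new, %pred.if ]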
+    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+  } else {
+    Type *PredInstType = PredInst->getType();
+    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+    Phi->addIncoming(ScalarPredInst, PredicatedBB);
+    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+  }
+}
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+  if (!User)
+    return State.ILV->vectorizeMemoryInstruction(&Instr);
+
+  // The last (and currently only) operand is a mask.
+  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    MaskValues[Part] = State.get(Mask, Part);
+  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
+}
+
 bool LoopVectorizePass::processLoop(Loop *L) {
   assert(L->empty() && "Only process inner loops.");
@@ -7878,7 +8601,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
 
   // Get user vectorization factor.
   unsigned UserVF = Hints.getWidth();
@@ -7941,48 +8664,61 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
     // Do not vectorize or interleave the loop.
-    ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
-                                       L->getStartLoc(), L->getHeader())
-              << VecDiagMsg.second);
-    ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
-                                       L->getStartLoc(), L->getHeader())
-              << IntDiagMsg.second);
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+                                      L->getStartLoc(), L->getHeader())
+             << VecDiagMsg.second;
+    });
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+                                      L->getStartLoc(), L->getHeader())
+             << IntDiagMsg.second;
+    });
     return false;
   } else if (!VectorizeLoop && InterleaveLoop) {
     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
-    ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
-                                         L->getStartLoc(), L->getHeader())
-              << VecDiagMsg.second);
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+                                        L->getStartLoc(), L->getHeader())
+             << VecDiagMsg.second;
+    });
   } else if (VectorizeLoop && !InterleaveLoop) {
     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                  << DebugLocStr << '\n');
-    ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
-                                         L->getStartLoc(), L->getHeader())
-              << IntDiagMsg.second);
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+                                        L->getStartLoc(), L->getHeader())
+             << IntDiagMsg.second;
+    });
   } else if (VectorizeLoop && InterleaveLoop) {
     DEBUG(dbgs() << "LV: Found a vectorizable loop (" <<
VF.Width << ") in "                   << DebugLocStr << '\n');      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');    } +  LVP.setBestPlan(VF.Width, IC); +    using namespace ore; +    if (!VectorizeLoop) {      assert(IC > 1 && "interleave count should not be 1 or 0");      // If we decided that it is not legal to vectorize the loop, then      // interleave it.      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,                                 &CM); -    LVP.executePlan(Unroller); +    LVP.executePlan(Unroller, DT); -    ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), -                                 L->getHeader()) -              << "interleaved loop (interleaved count: " -              << NV("InterleaveCount", IC) << ")"); +    ORE->emit([&]() { +      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), +                                L->getHeader()) +             << "interleaved loop (interleaved count: " +             << NV("InterleaveCount", IC) << ")"; +    });    } else {      // If we decided that it is *legal* to vectorize the loop, then do it.      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,                             &LVL, &CM); -    LVP.executePlan(LB); +    LVP.executePlan(LB, DT);      ++LoopsVectorized;      // Add metadata to disable runtime unrolling a scalar loop when there are @@ -7992,11 +8728,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {        AddRuntimeUnrollDisableMetaData(L);      // Report the vectorization decision. -    ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), -                                 L->getHeader()) -              << "vectorized loop (vectorization width: " -              << NV("VectorizationFactor", VF.Width) -              << ", interleaved count: " << NV("InterleaveCount", IC) << ")"); +    ORE->emit([&]() { +      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), +                                L->getHeader()) +             << "vectorized loop (vectorization width: " +             << NV("VectorizationFactor", VF.Width) +             << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; +    });    }    // Mark the loop as already vectorized to avoid vectorizing again. @@ -8012,7 +8750,6 @@ bool LoopVectorizePass::runImpl(      DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,      std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,      OptimizationRemarkEmitter &ORE_) { -    SE = &SE_;    LI = &LI_;    TTI = &TTI_; @@ -8068,10 +8805,8 @@ bool LoopVectorizePass::runImpl(    // Process each loop nest in the function.    
return Changed; -  } -  PreservedAnalyses LoopVectorizePass::run(Function &F,                                           FunctionAnalysisManager &AM) {      auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); @@ -8088,7 +8823,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,      auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();      std::function<const LoopAccessInfo &(Loop &)> GetLAA =          [&](Loop &L) -> const LoopAccessInfo & { -      LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; +      LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};        return LAM.getResult<LoopAccessAnalysis>(L, AR);      };      bool Changed = diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index dcbcab459a6b..76ba62f5d596 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6,6 +6,7 @@  // License. See LICENSE.TXT for details.  //  //===----------------------------------------------------------------------===// +//  // This pass implements the Bottom Up SLP vectorizer. It detects consecutive  // stores that can be put together into vector-stores. Next, it attempts to  // construct vectorizable tree using the use-def chains. If a profitable tree @@ -15,39 +16,89 @@  //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.  //  //===----------------------------------------------------------------------===// +  #include "llvm/Transforms/Vectorize/SLPVectorizer.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/None.h"  #include "llvm/ADT/Optional.h"  #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h"  #include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h"  #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/NoFolder.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h"  #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h"  #include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h"  #include "llvm/IR/Verifier.h"  #include "llvm/Pass.h" +#include 
"llvm/Support/Casting.h"  #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DOTGraphTraits.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/GraphWriter.h"  #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Vectorize.h"  #include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator>  #include <memory> +#include <set> +#include <string> +#include <tuple> +#include <utility> +#include <vector>  using namespace llvm; +using namespace llvm::PatternMatch;  using namespace slpvectorizer;  #define SV_NAME "slp-vectorizer" @@ -156,6 +207,119 @@ static bool isSplat(ArrayRef<Value *> VL) {    return true;  } +/// Checks if the vector of instructions can be represented as a shuffle, like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %x0x0 = mul i8 %x0, %x0 +/// %x3x3 = mul i8 %x3, %x3 +/// %y1y1 = mul i8 %y1, %y1 +/// %y2y2 = mul i8 %y2, %y2 +/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 +/// ret <4 x i8> %ins4 +/// can be transformed into: +/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, +///                                                         i32 6> +/// %2 = mul <4 x i8> %1, %1 +/// ret <4 x i8> %2 +/// We convert this initially to something like: +/// %x0 = extractelement <4 x i8> %x, i32 0 +/// %x3 = extractelement <4 x i8> %x, i32 3 +/// %y1 = extractelement <4 x i8> %y, i32 1 +/// %y2 = extractelement <4 x i8> %y, i32 2 +/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 +/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 +/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 +/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 +/// %5 = mul <4 x i8> %4, %4 +/// %6 = extractelement <4 x i8> %5, i32 0 +/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 +/// %7 = extractelement <4 x i8> %5, i32 1 +/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 +/// %8 = extractelement <4 x i8> %5, i32 2 +/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 +/// %9 = extractelement <4 x i8> %5, i32 3 +/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 +/// ret <4 x i8> %ins4 +/// InstCombiner transforms this into a shuffle and vector mul +static Optional<TargetTransformInfo::ShuffleKind> +isShuffle(ArrayRef<Value *> VL) { +  auto *EI0 = cast<ExtractElementInst>(VL[0]); +  unsigned Size = EI0->getVectorOperandType()->getVectorNumElements(); +  Value *Vec1 = nullptr; +  Value *Vec2 = nullptr; +  enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute}; +  ShuffleMode CommonShuffleMode = Unknown; +  for (unsigned I = 0, E = VL.size(); I < E; ++I) { +    auto *EI = cast<ExtractElementInst>(VL[I]); +    auto *Vec = EI->getVectorOperand(); +    // All vector operands must have the same number of vector elements. +    if (Vec->getType()->getVectorNumElements() != Size) +      return None; +    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); +    if (!Idx) +      return None; +    // Undefined behavior if Idx is negative or >= Size. 
+    if (Idx->getValue().uge(Size))
+      continue;
+    unsigned IntIdx = Idx->getValue().getZExtValue();
+    // We can extract from an undef vector.
+    if (isa<UndefValue>(Vec))
+      continue;
+    // For correct shuffling we have to have at most 2 different vector operands
+    // in all extractelement instructions.
+    if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
+      return None;
+    if (CommonShuffleMode == Permute)
+      continue;
+    // If the extract index is not the same as the operation number, it is a
+    // permutation.
+    if (IntIdx != I) {
+      CommonShuffleMode = Permute;
+      continue;
+    }
+    // Check the shuffle mode for the current operation.
+    if (!Vec1)
+      Vec1 = Vec;
+    else if (Vec != Vec1)
+      Vec2 = Vec;
+    // Below, A stands for Vec1 and B for Vec2.
+    // Example: shufflevector A, B, <0,5,2,7>
+    // I is even and IntIdx for A == I - FirstAlternate shuffle.
+    // I is odd and IntIdx for B == I - FirstAlternate shuffle.
+    // Example: shufflevector A, B, <4,1,6,3>
+    // I is odd and IntIdx for A == I - SecondAlternate shuffle.
+    // I is even and IntIdx for B == I - SecondAlternate shuffle.
+    const bool IIsOdd = I & 1;
+    const bool IIsEven = !IIsOdd;
+    const bool CurrVecIsA = Vec == Vec1;
+    const bool CurrVecIsB = !CurrVecIsA;
+    ShuffleMode CurrentShuffleMode =
+        ((IIsEven && CurrVecIsA) || (IIsOdd && CurrVecIsB)) ? FirstAlternate
+                                                            : SecondAlternate;
+    // Common mode is not set or is the same as the shuffle mode of the current
+    // operation - alternate.
+    if (CommonShuffleMode == Unknown)
+      CommonShuffleMode = CurrentShuffleMode;
+    // Common shuffle mode is not the same as the shuffle mode of the current
+    // operation - permutation.
+    if (CommonShuffleMode != CurrentShuffleMode)
+      CommonShuffleMode = Permute;
+  }
+  // If we're not crossing lanes in different vectors, consider it as blending.
+  if ((CommonShuffleMode == FirstAlternate ||
+       CommonShuffleMode == SecondAlternate) &&
+      Vec2)
+    return TargetTransformInfo::SK_Alternate;
+  // If Vec2 was never used, we have a permutation of a single vector;
+  // otherwise we have a permutation of two vectors.
+  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+              : TargetTransformInfo::SK_PermuteSingleSrc;
+}
+
 ///\returns Opcode that can be clubbed with \p Op to create an alternate
 /// sequence which can later be merged as a ShuffleVector instruction.
 static unsigned getAltOpcode(unsigned Op) {
@@ -173,50 +337,107 @@ static unsigned getAltOpcode(unsigned Op) {
   }
 }
 
-/// true if the \p Value is odd, false otherwise.
 static bool isOdd(unsigned Value) {
   return Value & 1;
 }
 
-///\returns bool representing if Opcode \p Op can be part
-/// of an alternate sequence which can later be merged as
-/// a ShuffleVector instruction.
-static bool canCombineAsAltInst(unsigned Op) {
-  return Op == Instruction::FAdd || Op == Instruction::FSub ||
-         Op == Instruction::Sub || Op == Instruction::Add;
+static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
+                            unsigned CheckedOpcode) {
+  return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
 }
 
-/// \returns ShuffleVector instruction if instructions in \p VL have
-///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
-/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
-static unsigned isAltInst(ArrayRef<Value *> VL) {
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
-  unsigned Opcode = I0->getOpcode();
-  unsigned AltOpcode = getAltOpcode(Opcode);
-  for (int i = 1, e = VL.size(); i < e; i++) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
-      return 0;
-  }
-  return Instruction::ShuffleVector;
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(Value *OpValue, Value *Op) {
+  auto *I = dyn_cast<Instruction>(Op);
+  if (!I)
+    return OpValue;
+  auto *OpInst = cast<Instruction>(OpValue);
+  unsigned OpInstOpcode = OpInst->getOpcode();
+  unsigned IOpcode = I->getOpcode();
+  if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
+    return Op;
+  return OpValue;
 }
 
-/// \returns The opcode if all of the Instructions in \p VL have the same
-/// opcode, or zero.
-static unsigned getSameOpcode(ArrayRef<Value *> VL) {
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+namespace {
+
+/// Contains data for the instructions to be vectorized.
+struct RawInstructionsData {
+  /// Main opcode of the instructions to be vectorized.
+  unsigned Opcode = 0;
+
+  /// True if the list contains instructions with alternate opcodes.
+  bool HasAltOpcodes = false;
+};
+
+} // end anonymous namespace
+
+/// Checks the list of instructions \p VL to be vectorized and returns info
+/// about it.
+static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
+  auto *I0 = dyn_cast<Instruction>(VL[0]);
   if (!I0)
-    return 0;
+    return {};
+  RawInstructionsData Res;
   unsigned Opcode = I0->getOpcode();
-  for (int i = 1, e = VL.size(); i < e; i++) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode()) {
-      if (canCombineAsAltInst(Opcode) && i == 1)
-        return isAltInst(VL);
-      return 0;
+  // Walk through the list of instructions to be vectorized, recording the
+  // structure described by RawInstructionsData.
+  for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
+    auto *I = dyn_cast<Instruction>(VL[Cnt]);
+    if (!I)
+      return {};
+    if (Opcode != I->getOpcode())
+      Res.HasAltOpcodes = true;
+  }
+  Res.Opcode = Opcode;
+  return Res;
+}
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+  /// The very first instruction in the list with the main opcode.
+  Value *OpValue = nullptr;
+
+  /// The main opcode for the list of instructions.
+  unsigned Opcode = 0;
+
+  /// Some of the instructions in the list have alternate opcodes.
+  bool IsAltShuffle = false;
+
+  InstructionsState() = default;
+  InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
+      : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+};
+
+} // end anonymous namespace
+
+/// \returns analysis of the Instructions in \p VL described in
+/// InstructionsState: the Opcode with which we assume the whole list
+/// can be vectorized, even if its structure is diverse.
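+/// For example, {fadd, fsub, fadd, fsub} is reported as Opcode == FAdd with
+/// IsAltShuffle set, whereas {fadd, fsub, fsub, fadd} breaks the even/odd
+/// alternation and is reported with Opcode == 0 (not vectorizable as a single
+/// bundle).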
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
+  auto Res = getMainOpcode(VL);
+  unsigned Opcode = Res.Opcode;
+  if (!Res.HasAltOpcodes)
+    return InstructionsState(VL[0], Opcode, false);
+  auto *OpInst = cast<Instruction>(VL[0]);
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  // Examine each element in the list of instructions VL to determine whether
+  // some of the operations can be considered alternates (for example, as
+  // subtraction relates to addition). HasAltOpcodes is known to be true here,
+  // so the alternate opcode is accepted at odd positions only.
+  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+    auto *I = cast<Instruction>(VL[Cnt]);
+    unsigned InstOpcode = I->getOpcode();
+    if (InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) {
+      return InstructionsState(OpInst, 0, false);
     }
   }
-  return Opcode;
+  return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
 }
 
 /// \returns true if all of the values in \p VL have the same type or false
@@ -247,7 +468,6 @@ static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
 /// possible scalar operand in vectorized instruction.
 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                     TargetLibraryInfo *TLI) {
-
   unsigned Opcode = UserInst->getOpcode();
   switch (Opcode) {
   case Instruction::Load: {
@@ -292,24 +512,25 @@ static bool isSimple(Instruction *I) {
 }
 
 namespace llvm {
+
 namespace slpvectorizer {
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
-  typedef SmallVector<Value *, 8> ValueList;
-  typedef SmallVector<Instruction *, 16> InstrList;
-  typedef SmallPtrSet<Value *, 16> ValueSet;
-  typedef SmallVector<StoreInst *, 8> StoreList;
-  typedef MapVector<Value *, SmallVector<Instruction *, 2>>
-      ExtraValueToDebugLocsMap;
+  using ValueList = SmallVector<Value *, 8>;
+  using InstrList = SmallVector<Instruction *, 16>;
+  using ValueSet = SmallPtrSet<Value *, 16>;
+  using StoreList = SmallVector<StoreInst *, 8>;
+  using ExtraValueToDebugLocsMap =
+      MapVector<Value *, SmallVector<Instruction *, 2>>;
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
-      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
-        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
-        DL(DL), ORE(ORE), Builder(Se->getContext()) {
+      : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+        DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
@@ -331,6 +552,7 @@ public:
   /// \brief Vectorize the tree that starts with the elements in \p VL.
   /// Returns the vectorized root.
   Value *vectorizeTree();
+
   /// Vectorize the tree but with the list of externally used values \p
   /// ExternallyUsedValues. Values in this MapVector can be replaced by the
   /// generated extractvalue instructions.
@@ -348,6 +570,7 @@ public:
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
 void buildTree(ArrayRef<Value *> Roots,
                ArrayRef<Value *> UserIgnoreLst = None);
+
 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
 /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
 /// into account (and updating it, if required) the list of externally used
@@ -374,7 +597,7 @@ public:
   unsigned getTreeSize() const { return VectorizableTree.size(); }
 
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
-  void optimizeGatherSequence();
+  void optimizeGatherSequence(Function &F);
 
   /// \returns true if it is beneficial to reverse the vector order.
   bool shouldReorder() const {
@@ -416,21 +639,30 @@ public:
 private:
   struct TreeEntry;
 
+  /// Checks if all users of \p I are part of the vectorization tree.
+  bool areAllUsersVectorized(Instruction *I) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
 
   /// This is the recursive part of buildTree.
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
+                     int OpdNum = 0);
 
   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
-  bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
+  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
 
-  /// Vectorize a single entry in the tree.
-  Value *vectorizeTree(TreeEntry *E);
+  /// Vectorize a single entry in the tree. \p OpdNum indicates the ordinality
+  /// of the operand corresponding to this tree entry \p E for the user tree
+  /// entry indicated by \p UserIndx.
+  //  In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
+  Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
 
-  /// Vectorize a single entry in the tree, starting in \p VL.
-  Value *vectorizeTree(ArrayRef<Value *> VL);
+  /// Vectorize a single entry in the tree, starting in \p VL. \p OpdNum
+  /// indicates the ordinality of the operand corresponding to the \p VL of
+  /// scalar values for the user, indicated by \p UserIndx, that this \p VL
+  /// feeds into.
+  Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
@@ -447,7 +679,7 @@ private:
   /// \brief Set the Builder insert point to one after the last instruction in
   /// the bundle
-  void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+  void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
 
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 
   /// \reorder commutative operands in alt shuffle if they result in
   ///  vectorized code.
-  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                  SmallVectorImpl<Value *> &Left,
                                  SmallVectorImpl<Value *> &Right);
+
   /// \reorder commutative operands to get better probability of
   /// generating vectorized code.
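  /// For example (an illustrative sketch), given the commutative bundle
  ///   { a[0] + b[0], b[1] + a[1] }
  /// the operands are reordered into Left = {a[0], a[1]} and
  /// Right = {b[0], b[1]}, so that each side can form a consecutive-load
  /// bundle.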
-  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, +  void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,                                        SmallVectorImpl<Value *> &Left,                                        SmallVectorImpl<Value *> &Right);    struct TreeEntry { -    TreeEntry(std::vector<TreeEntry> &Container) -        : Scalars(), VectorizedValue(nullptr), NeedToGather(0), -          Container(Container) {} +    TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}      /// \returns true if the scalars in VL are equal to this entry.      bool isSame(ArrayRef<Value *> VL) const { @@ -477,14 +708,32 @@ private:        return std::equal(VL.begin(), VL.end(), Scalars.begin());      } +    /// \returns true if the scalars in VL are found in this tree entry. +    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL, +        ScalarEvolution &SE) const { +      assert(VL.size() == Scalars.size() && "Invalid size"); +      SmallVector<Value *, 8> List; +      if (!sortLoadAccesses(VL, DL, SE, List)) +        return false; +      return std::equal(List.begin(), List.end(), Scalars.begin()); +    } +      /// A vector of scalars.      ValueList Scalars;      /// The Scalars are vectorized into this value. It is initialized to Null. -    Value *VectorizedValue; +    Value *VectorizedValue = nullptr;      /// Do we need to gather this sequence ? -    bool NeedToGather; +    bool NeedToGather = false; + +    /// Records optional shuffle mask for the uses of jumbled memory accesses. +    /// For example, a non-empty ShuffleMask[1] represents the permutation of +    /// lanes that operand #1 of this vectorized instruction should undergo +    /// before feeding this vectorized instruction, whereas an empty +    /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized +    /// instruction need not be permuted at all. +    SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;      /// Points back to the VectorizableTree.      /// @@ -501,12 +750,31 @@ private:    /// Create a new VectorizableTree entry.    
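+  /// When non-empty, \p ShuffleMask records on the user entry \p UserTreeIdx
+  /// the permutation that operand \p OpdNum must undergo before feeding that
+  /// user; e.g. (an illustrative assumption) a mask of {1,2,0} would permute
+  /// the lanes of the vectorized jumbled loads accordingly.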
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, -                          int &UserTreeIdx) { +                          int &UserTreeIdx, const InstructionsState &S, +                          ArrayRef<unsigned> ShuffleMask = None, +                          int OpdNum = 0) { +    assert((!Vectorized || S.Opcode != 0) && +           "Vectorized TreeEntry without opcode");      VectorizableTree.emplace_back(VectorizableTree); +      int idx = VectorizableTree.size() - 1;      TreeEntry *Last = &VectorizableTree[idx];      Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());      Last->NeedToGather = !Vectorized; + +    TreeEntry *UserTreeEntry = nullptr; +    if (UserTreeIdx != -1) +      UserTreeEntry = &VectorizableTree[UserTreeIdx]; + +    if (UserTreeEntry && !ShuffleMask.empty()) { +      if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size()) +        UserTreeEntry->ShuffleMask.resize(OpdNum + 1); +      assert(UserTreeEntry->ShuffleMask[OpdNum].empty() && +             "Mask already present"); +      using mask = SmallVector<unsigned, 4>; +      mask tempMask(ShuffleMask.begin(), ShuffleMask.end()); +      UserTreeEntry->ShuffleMask[OpdNum] = tempMask; +    }      if (Vectorized) {        for (int i = 0, e = VL.size(); i != e; ++i) {          assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -548,16 +816,19 @@ private:    /// This POD struct describes one external user in the vectorized tree.    struct ExternalUser { -    ExternalUser (Value *S, llvm::User *U, int L) : -      Scalar(S), User(U), Lane(L){} +    ExternalUser(Value *S, llvm::User *U, int L) +        : Scalar(S), User(U), Lane(L) {} +      // Which scalar in our function.      Value *Scalar; +      // Which user that uses the scalar.      llvm::User *User; +      // Which lane does the scalar belong to.      int Lane;    }; -  typedef SmallVector<ExternalUser, 16> UserList; +  using UserList = SmallVector<ExternalUser, 16>;    /// Checks if two instructions may access the same memory.    /// @@ -565,7 +836,6 @@ private:    /// is invariant in the calling loop.    bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,                   Instruction *Inst2) { -      // First check if the result is already in the cache.      AliasCacheKey key = std::make_pair(Inst1, Inst2);      Optional<bool> &result = AliasCache[key]; @@ -583,7 +853,7 @@ private:      return aliased;    } -  typedef std::pair<Instruction *, Instruction *> AliasCacheKey; +  using AliasCacheKey = std::pair<Instruction *, Instruction *>;    /// Cache for alias results.    /// TODO: consider moving this to the AliasAnalysis itself. @@ -616,6 +886,7 @@ private:    /// Holds all of the instructions that we gathered.    SetVector<Instruction *> GatherSeq; +    /// A list of blocks that we are going to CSE.    SetVector<BasicBlock *> CSEBlocks; @@ -624,18 +895,13 @@ private:    /// instruction bundle (= a group of instructions which is combined into a    /// vector instruction).    struct ScheduleData { -      // The initial value for the dependency counters. It means that the      // dependencies are not calculated yet.      
     enum { InvalidDeps = -1 };
 
-    ScheduleData()
-        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
-          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
-          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
-          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+    ScheduleData() = default;
 
-    void init(int BlockSchedulingRegionID) {
+    void init(int BlockSchedulingRegionID, Value *OpVal) {
       FirstInBundle = this;
       NextInBundle = nullptr;
       NextLoadStore = nullptr;
@@ -643,6 +909,7 @@ private:
       SchedulingRegionID = BlockSchedulingRegionID;
       UnscheduledDepsInBundle = UnscheduledDeps;
       clearDependencies();
+      OpValue = OpVal;
     }
 
     /// Returns true if the dependency information has been calculated.
@@ -702,19 +969,19 @@ private:
       }
     }
 
-    Instruction *Inst;
+    Instruction *Inst = nullptr;
 
     /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
-    ScheduleData *FirstInBundle;
+    ScheduleData *FirstInBundle = nullptr;
 
     /// Single linked list of all instructions in a bundle. Null if it is a
     /// single instruction.
-    ScheduleData *NextInBundle;
+    ScheduleData *NextInBundle = nullptr;
 
     /// Single linked list of all memory instructions (e.g. load, store, call)
     /// in the block - until the end of the scheduling region.
-    ScheduleData *NextLoadStore;
+    ScheduleData *NextLoadStore = nullptr;
 
     /// The dependent memory instructions.
     /// This list is derived on demand in calculateDependencies().
@@ -722,31 +989,33 @@ private:
     /// This ScheduleData is in the current scheduling region if this matches
     /// the current SchedulingRegionID of BlockScheduling.
-    int SchedulingRegionID;
+    int SchedulingRegionID = 0;
 
     /// Used for getting a "good" final ordering of instructions.
-    int SchedulingPriority;
+    int SchedulingPriority = 0;
 
     /// The number of dependencies. Consists of the number of users of the
     /// instruction plus the number of dependent memory instructions (if any).
     /// This value is calculated on demand.
     /// If InvalidDeps, the number of dependencies is not calculated yet.
-    ///
-    int Dependencies;
+    int Dependencies = InvalidDeps;
 
     /// The number of dependencies minus the number of dependencies of scheduled
     /// instructions. As soon as this is zero, the instruction/bundle gets ready
     /// for scheduling.
     /// Note that this is negative as long as Dependencies is not calculated.
-    int UnscheduledDeps;
+    int UnscheduledDeps = InvalidDeps;
 
     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
     /// single instructions.
-    int UnscheduledDepsInBundle;
+    int UnscheduledDepsInBundle = InvalidDeps;
 
     /// True if this instruction is scheduled (or considered as scheduled in the
     /// dry-run).
-    bool IsScheduled;
+    bool IsScheduled = false;
+
+    /// The value serving as the (main or alternate) opcode key of the current
+    /// instruction in the schedule data.
+    Value *OpValue = nullptr;
   };
 
 #ifndef NDEBUG
@@ -756,22 +1025,14 @@ private:
     return os;
   }
 #endif
+
   friend struct GraphTraits<BoUpSLP *>;
   friend struct DOTGraphTraits<BoUpSLP *>;
 
   /// Contains all scheduling data for a basic block.
-  ///    struct BlockScheduling { -      BlockScheduling(BasicBlock *BB) -        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), -          ScheduleStart(nullptr), ScheduleEnd(nullptr), -          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), -          ScheduleRegionSize(0), -          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), -          // Make sure that the initial SchedulingRegionID is greater than the -          // initial SchedulingRegionID in ScheduleData (which is 0). -          SchedulingRegionID(1) {} +        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}      void clear() {        ReadyInsts.clear(); @@ -799,6 +1060,18 @@ private:        return nullptr;      } +    ScheduleData *getScheduleData(Value *V, Value *Key) { +      if (V == Key) +        return getScheduleData(V); +      auto I = ExtraScheduleDataMap.find(V); +      if (I != ExtraScheduleDataMap.end()) { +        ScheduleData *SD = I->second[Key]; +        if (SD && SD->SchedulingRegionID == SchedulingRegionID) +          return SD; +      } +      return nullptr; +    } +      bool isInSchedulingRegion(ScheduleData *SD) {        return SD->SchedulingRegionID == SchedulingRegionID;      } @@ -812,19 +1085,29 @@ private:        ScheduleData *BundleMember = SD;        while (BundleMember) { +        if (BundleMember->Inst != BundleMember->OpValue) { +          BundleMember = BundleMember->NextInBundle; +          continue; +        }          // Handle the def-use chain dependencies.          for (Use &U : BundleMember->Inst->operands()) { -          ScheduleData *OpDef = getScheduleData(U.get()); -          if (OpDef && OpDef->hasValidDependencies() && -              OpDef->incrementUnscheduledDeps(-1) == 0) { -            // There are no more unscheduled dependencies after decrementing, -            // so we can put the dependent instruction into the ready list. -            ScheduleData *DepBundle = OpDef->FirstInBundle; -            assert(!DepBundle->IsScheduled && -                   "already scheduled bundle gets ready"); -            ReadyList.insert(DepBundle); -            DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n"); -          } +          auto *I = dyn_cast<Instruction>(U.get()); +          if (!I) +            continue; +          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { +            if (OpDef && OpDef->hasValidDependencies() && +                OpDef->incrementUnscheduledDeps(-1) == 0) { +              // There are no more unscheduled dependencies after +              // decrementing, so we can put the dependent instruction +              // into the ready list. +              ScheduleData *DepBundle = OpDef->FirstInBundle; +              assert(!DepBundle->IsScheduled && +                     "already scheduled bundle gets ready"); +              ReadyList.insert(DepBundle); +              DEBUG(dbgs() +                    << "SLP:    gets ready (def): " << *DepBundle << "\n"); +            } +          });          }          // Handle the memory dependencies.          
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -835,22 +1118,35 @@ private:              assert(!DepBundle->IsScheduled &&                     "already scheduled bundle gets ready");              ReadyList.insert(DepBundle); -            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n"); +            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle +                         << "\n");            }          }          BundleMember = BundleMember->NextInBundle;        }      } +    void doForAllOpcodes(Value *V, +                         function_ref<void(ScheduleData *SD)> Action) { +      if (ScheduleData *SD = getScheduleData(V)) +        Action(SD); +      auto I = ExtraScheduleDataMap.find(V); +      if (I != ExtraScheduleDataMap.end()) +        for (auto &P : I->second) +          if (P.second->SchedulingRegionID == SchedulingRegionID) +            Action(P.second); +    } +      /// Put all instructions into the ReadyList which are ready for scheduling.      template <typename ReadyListType>      void initialFillReadyList(ReadyListType &ReadyList) {        for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { -        ScheduleData *SD = getScheduleData(I); -        if (SD->isSchedulingEntity() && SD->isReady()) { -          ReadyList.insert(SD); -          DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n"); -        } +        doForAllOpcodes(I, [&](ScheduleData *SD) { +          if (SD->isSchedulingEntity() && SD->isReady()) { +            ReadyList.insert(SD); +            DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n"); +          } +        });        }      } @@ -862,9 +1158,12 @@ private:      /// Un-bundles a group of instructions.      void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); +    /// Allocates schedule data chunk. +    ScheduleData *allocateScheduleDataChunks(); +      /// Extends the scheduling region so that V is inside the region.      /// \returns true if the region size is within the limit. -    bool extendSchedulingRegion(Value *V); +    bool extendSchedulingRegion(Value *V, Value *OpValue);      /// Initialize the ScheduleData structures for new instructions in the      /// scheduling region. @@ -897,6 +1196,10 @@ private:      /// ScheduleData structures are recycled.      DenseMap<Value *, ScheduleData *> ScheduleDataMap; +    /// Attaches ScheduleData to Instruction with the leading key. +    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> +        ExtraScheduleDataMap; +      struct ReadyList : SmallVector<ScheduleData *, 8> {        void insert(ScheduleData *SD) { push_back(SD); }      }; @@ -905,28 +1208,30 @@ private:      ReadyList ReadyInsts;      /// The first instruction of the scheduling region. -    Instruction *ScheduleStart; +    Instruction *ScheduleStart = nullptr;      /// The first instruction _after_ the scheduling region. -    Instruction *ScheduleEnd; +    Instruction *ScheduleEnd = nullptr;      /// The first memory accessing instruction in the scheduling region      /// (can be null). -    ScheduleData *FirstLoadStoreInRegion; +    ScheduleData *FirstLoadStoreInRegion = nullptr;      /// The last memory accessing instruction in the scheduling region      /// (can be null). -    ScheduleData *LastLoadStoreInRegion; +    ScheduleData *LastLoadStoreInRegion = nullptr;      /// The current size of the scheduling region. 
-    int ScheduleRegionSize; +    int ScheduleRegionSize = 0;      /// The maximum size allowed for the scheduling region. -    int ScheduleRegionSizeLimit; +    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;      /// The ID of the scheduling region. For a new vectorization iteration this      /// is incremented which "removes" all ScheduleData from the region. -    int SchedulingRegionID; +    // Make sure that the initial SchedulingRegionID is greater than the +    // initial SchedulingRegionID in ScheduleData (which is 0). +    int SchedulingRegionID = 1;    };    /// Attaches the BlockScheduling structures to basic blocks. @@ -940,10 +1245,10 @@ private:    ArrayRef<Value *> UserIgnoreList;    // Number of load bundles that contain consecutive loads. -  int NumLoadsWantToKeepOrder; +  int NumLoadsWantToKeepOrder = 0;    // Number of load bundles that contain consecutive loads in reversed order. -  int NumLoadsWantToChangeOrder; +  int NumLoadsWantToChangeOrder = 0;    // Analysis and block reference.    Function *F; @@ -960,6 +1265,7 @@ private:    unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.    unsigned MinVecRegSize; // Set by cl::opt (default: 128). +    /// Instruction builder to construct the vectorized tree.    IRBuilder<> Builder; @@ -970,20 +1276,20 @@ private:    /// original width.    MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;  }; +  } // end namespace slpvectorizer  template <> struct GraphTraits<BoUpSLP *> { -  typedef BoUpSLP::TreeEntry TreeEntry; +  using TreeEntry = BoUpSLP::TreeEntry;    /// NodeRef has to be a pointer per the GraphWriter. -  typedef TreeEntry *NodeRef; +  using NodeRef = TreeEntry *;    /// \brief Add the VectorizableTree to the index iterator to be able to return    /// TreeEntry pointers.    struct ChildIteratorType        : public iterator_adaptor_base<ChildIteratorType,                                       SmallVector<int, 1>::iterator> { -      std::vector<TreeEntry> &VectorizableTree;      ChildIteratorType(SmallVector<int, 1>::iterator W, @@ -998,17 +1304,19 @@ template <> struct GraphTraits<BoUpSLP *> {    static ChildIteratorType child_begin(NodeRef N) {      return {N->UserTreeIndices.begin(), N->Container};    } +    static ChildIteratorType child_end(NodeRef N) {      return {N->UserTreeIndices.end(), N->Container};    }    /// For the node iterator we just need to turn the TreeEntry iterator into a    /// TreeEntry* iterator so that it dereferences to NodeRef. 
-  typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator; +  using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;    static nodes_iterator nodes_begin(BoUpSLP *R) {      return nodes_iterator(R->VectorizableTree.begin());    } +    static nodes_iterator nodes_end(BoUpSLP *R) {      return nodes_iterator(R->VectorizableTree.end());    } @@ -1017,7 +1325,7 @@ template <> struct GraphTraits<BoUpSLP *> {  };  template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { -  typedef BoUpSLP::TreeEntry TreeEntry; +  using TreeEntry = BoUpSLP::TreeEntry;    DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} @@ -1054,6 +1362,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,    ExtraValueToDebugLocsMap ExternallyUsedValues;    buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);  } +  void BoUpSLP::buildTree(ArrayRef<Value *> Roots,                          ExtraValueToDebugLocsMap &ExternallyUsedValues,                          ArrayRef<Value *> UserIgnoreLst) { @@ -1118,44 +1427,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,  }  void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, -                            int UserTreeIdx) { -  bool isAltShuffle = false; +                            int UserTreeIdx, int OpdNum) {    assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); +  InstructionsState S = getSameOpcode(VL);    if (Depth == RecursionMaxDepth) {      DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, S);      return;    }    // Don't handle vectors. -  if (VL[0]->getType()->isVectorTy()) { +  if (S.OpValue->getType()->isVectorTy()) {      DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, S);      return;    } -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) +  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))      if (SI->getValueOperand()->getType()->isVectorTy()) {        DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); -      newTreeEntry(VL, false, UserTreeIdx); +      newTreeEntry(VL, false, UserTreeIdx, S);        return;      } -  unsigned Opcode = getSameOpcode(VL); - -  // Check that this shuffle vector refers to the alternate -  // sequence of opcodes. -  if (Opcode == Instruction::ShuffleVector) { -    Instruction *I0 = dyn_cast<Instruction>(VL[0]); -    unsigned Op = I0->getOpcode(); -    if (Op != Instruction::ShuffleVector) -      isAltShuffle = true; -  }    // If all of the operands are identical or constant we have a simple solution. -  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { +  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {      DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, S);      return;    } @@ -1167,87 +1466,92 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      if (EphValues.count(VL[i])) {        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<              ") is ephemeral.\n"); -      newTreeEntry(VL, false, UserTreeIdx); +      newTreeEntry(VL, false, UserTreeIdx, S);        return;      }    }    // Check if this is a duplicate of another entry. 
-  if (TreeEntry *E = getTreeEntry(VL[0])) { +  if (TreeEntry *E = getTreeEntry(S.OpValue)) {      for (unsigned i = 0, e = VL.size(); i != e; ++i) {        DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");        if (E->Scalars[i] != VL[i]) {          DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); -        newTreeEntry(VL, false, UserTreeIdx); +        newTreeEntry(VL, false, UserTreeIdx, S);          return;        }      }      // Record the reuse of the tree node.  FIXME, currently this is only used to      // properly draw the graph rather than for the actual vectorization.      E->UserTreeIndices.push_back(UserTreeIdx); -    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); +    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");      return;    }    // Check that none of the instructions in the bundle are already in the tree.    for (unsigned i = 0, e = VL.size(); i != e; ++i) { -    if (ScalarToTreeEntry.count(VL[i])) { +    auto *I = dyn_cast<Instruction>(VL[i]); +    if (!I) +      continue; +    if (getTreeEntry(I)) {        DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<              ") is already in tree.\n"); -      newTreeEntry(VL, false, UserTreeIdx); +      newTreeEntry(VL, false, UserTreeIdx, S);        return;      }    } -  // If any of the scalars is marked as a value that needs to stay scalar then +  // If any of the scalars is marked as a value that needs to stay scalar, then    // we need to gather the scalars.    for (unsigned i = 0, e = VL.size(); i != e; ++i) {      if (MustGather.count(VL[i])) {        DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); -      newTreeEntry(VL, false, UserTreeIdx); +      newTreeEntry(VL, false, UserTreeIdx, S);        return;      }    }    // Check that all of the users of the scalars that we want to vectorize are    // schedulable. -  Instruction *VL0 = cast<Instruction>(VL[0]); +  auto *VL0 = cast<Instruction>(S.OpValue);    BasicBlock *BB = VL0->getParent();    if (!DT->isReachableFromEntry(BB)) {      // Don't go into unreachable blocks. They may contain instructions with      // dependency cycles which confuse the final scheduling.      DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, S);      return;    } -  // Check that every instructions appears once in this bundle. +  // Check that every instruction appears once in this bundle.    
for (unsigned i = 0, e = VL.size(); i < e; ++i) -    for (unsigned j = i+1; j < e; ++j) +    for (unsigned j = i + 1; j < e; ++j)        if (VL[i] == VL[j]) {          DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); -        newTreeEntry(VL, false, UserTreeIdx); +        newTreeEntry(VL, false, UserTreeIdx, S);          return;        }    auto &BSRef = BlocksSchedules[BB]; -  if (!BSRef) { +  if (!BSRef)      BSRef = llvm::make_unique<BlockScheduling>(BB); -  } +    BlockScheduling &BS = *BSRef.get(); -  if (!BS.tryScheduleBundle(VL, this, VL0)) { +  if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {      DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); -    assert((!BS.getScheduleData(VL[0]) || -            !BS.getScheduleData(VL[0])->isPartOfBundle()) && +    assert((!BS.getScheduleData(VL0) || +            !BS.getScheduleData(VL0)->isPartOfBundle()) &&             "tryScheduleBundle should cancelScheduling on failure"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, S);      return;    }    DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); -  switch (Opcode) { +  unsigned ShuffleOrOp = S.IsAltShuffle ? +                (unsigned) Instruction::ShuffleVector : S.Opcode; +  switch (ShuffleOrOp) {      case Instruction::PHI: {        PHINode *PH = dyn_cast<PHINode>(VL0); @@ -1259,12 +1563,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            if (Term) {              DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx); +            newTreeEntry(VL, false, UserTreeIdx, S);              return;            }          } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1274,35 +1578,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(                PH->getIncomingBlock(i))); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return;      }      case Instruction::ExtractValue:      case Instruction::ExtractElement: { -      bool Reuse = canReuseExtract(VL, Opcode); +      bool Reuse = canReuseExtract(VL, VL0);        if (Reuse) {          DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");        } else {          BS.cancelScheduling(VL, VL0);        } -      newTreeEntry(VL, Reuse, UserTreeIdx); +      newTreeEntry(VL, Reuse, UserTreeIdx, S);        return;      }      case Instruction::Load: {        // Check that a vectorized load would load the same memory as a scalar -      // load. -      // For example we don't want vectorize loads that are smaller than 8 bit. -      // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats -      // loading/storing it as an i8 struct. If we vectorize loads/stores from -      // such a struct we read/write packed bits disagreeing with the +      // load. For example, we don't want to vectorize loads that are smaller +      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM +      // treats loading/storing it as an i8 struct. 
If we vectorize loads/stores
+      // from such a struct, we read/write packed bits disagreeing with the
       // unvectorized version.
-      Type *ScalarTy = VL[0]->getType();
+      Type *ScalarTy = VL0->getType();
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1313,15 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
       }

       // Check if the loads are consecutive, reversed, or neither.
-      // TODO: What we really want is to sort the loads, but for now, check
-      // the two likely directions.
       bool Consecutive = true;
       bool ReverseConsecutive = true;
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1335,7 +1636,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true, UserTreeIdx);
+        newTreeEntry(VL, true, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1349,15 +1650,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             break;
           }

-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx);
-
       if (ReverseConsecutive) {
-        ++NumLoadsWantToChangeOrder;
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
-      } else {
-        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+        ++NumLoadsWantToChangeOrder;
+        BS.cancelScheduling(VL, VL0);
+        newTreeEntry(VL, false, UserTreeIdx, S);
+        return;
+      }
+
+      if (VL.size() > 2) {
+        bool ShuffledLoads = true;
+        SmallVector<Value *, 8> Sorted;
+        SmallVector<unsigned, 4> Mask;
+        if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
+          auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
+          for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
+            if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
+              ShuffledLoads = false;
+              break;
+            }
+          }
+          // TODO: Tracking how many loads want to have an arbitrary shuffled
+          // order would be useful.
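          // Illustrative sketch (editor's example, not part of this patch):
          // given a bundle of loads in jumbled source order,
          //
          //   VL     = { load a[1], load a[3], load a[0], load a[2] }
          //   Sorted = { load a[0], load a[1], load a[2], load a[3] }
          //   Mask   = { 1, 3, 0, 2 }   // hypothetical lane mapping
          //
          // sortLoadAccesses yields the address-sorted bundle plus a mask
          // relating sorted lanes to the original ones; the consecutive check
          // above runs on Sorted, and the mask later drives the shuffle that
          // restores the original lane order.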
+          if (ShuffledLoads) { +            DEBUG(dbgs() << "SLP: added a vector of loads which needs " +                            "permutation of loaded lanes.\n"); +            newTreeEntry(NewVL, true, UserTreeIdx, S, +                         makeArrayRef(Mask.begin(), Mask.end()), OpdNum); +            return; +          } +        }        } + +      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +      BS.cancelScheduling(VL, VL0); +      newTreeEntry(VL, false, UserTreeIdx, S);        return;      }      case Instruction::ZExt: @@ -1377,12 +1704,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();          if (Ty != SrcTy || !isValidElementType(Ty)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of casts.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1391,7 +1718,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return;      } @@ -1399,19 +1726,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      case Instruction::FCmp: {        // Check that all of the compares have the same predicate.        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); -      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); +      Type *ComparedTy = VL0->getOperand(0)->getType();        for (unsigned i = 1, e = VL.size(); i < e; ++i) {          CmpInst *Cmp = cast<CmpInst>(VL[i]);          if (Cmp->getPredicate() != P0 ||              Cmp->getOperand(0)->getType() != ComparedTy) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of compares.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1420,7 +1747,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return;      } @@ -1442,17 +1769,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      case Instruction::AShr:      case Instruction::And:      case Instruction::Or: -    case Instruction::Xor: { -      newTreeEntry(VL, true, UserTreeIdx); +    case Instruction::Xor: +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of bin op.\n");        // Sort operands of the instructions so that each side is more likely to        // have the same opcode.        
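      // Illustrative sketch (editor's example, operand names are made up):
      // for a commutative bundle such as
      //
      //   VL = { (a * b) + c, d + (e * f) }
      //
      // commuting the second addition puts both multiplies in one column,
      //   Left = { a * b, e * f }, Right = { c, d },
      // giving each operand bundle a better chance to vectorize on its own.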
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {          ValueList Left, Right; -        reorderInputsAccordingToOpcode(VL, Left, Right); +        reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx); -        buildTree_rec(Right, Depth + 1, UserTreeIdx); +        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);          return;        } @@ -1462,30 +1789,30 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return; -    } +      case Instruction::GetElementPtr: {        // We don't combine GEPs with complicated (nested) indexing.        for (unsigned j = 0; j < VL.size(); ++j) {          if (cast<Instruction>(VL[j])->getNumOperands() != 2) {            DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            return;          }        }        // We can't combine several GEPs into one vector if they operate on        // different types. -      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); +      Type *Ty0 = VL0->getOperand(0)->getType();        for (unsigned j = 0; j < VL.size(); ++j) {          Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();          if (Ty0 != CurTy) {            DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            return;          }        } @@ -1497,12 +1824,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            DEBUG(                dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");        for (unsigned i = 0, e = 2; i < e; ++i) {          ValueList Operands; @@ -1510,7 +1837,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return;      } @@ -1519,12 +1846,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)          if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            DEBUG(dbgs() << "SLP: Non-consecutive store.\n");            return;          } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a vector of stores.\n");        ValueList Operands; @@ -1536,13 +1863,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      }      case Instruction::Call: {        // 
Check if the calls are all to the same vectorizable intrinsic. -      CallInst *CI = cast<CallInst>(VL[0]); +      CallInst *CI = cast<CallInst>(VL0);        // Check if this is an Intrinsic call or something that can be        // represented by an intrinsic call        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);        if (!isTriviallyVectorizable(ID)) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx); +        newTreeEntry(VL, false, UserTreeIdx, S);          DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");          return;        } @@ -1556,7 +1883,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,              getVectorIntrinsicIDForCall(CI2, TLI) != ID ||              !CI->hasIdenticalOperandBundleSchema(*CI2)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]                         << "\n");            return; @@ -1567,7 +1894,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            Value *A1J = CI2->getArgOperand(1);            if (A1I != A1J) {              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx); +            newTreeEntry(VL, false, UserTreeIdx, S);              DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI                           << " argument "<< A1I<<"!=" << A1J                           << "\n"); @@ -1580,14 +1907,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,                          CI->op_begin() + CI->getBundleOperandsEndIndex(),                          CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, S);            DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="                         << *VL[i] << '\n');            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {          ValueList Operands;          // Prepare the operand vector. @@ -1595,28 +1922,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            CallInst *CI2 = dyn_cast<CallInst>(j);            Operands.push_back(CI2->getArgOperand(i));          } -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return;      } -    case Instruction::ShuffleVector: { +    case Instruction::ShuffleVector:        // If this is not an alternate sequence of opcode like add-sub        // then do not vectorize this instruction. -      if (!isAltShuffle) { +      if (!S.IsAltShuffle) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx); +        newTreeEntry(VL, false, UserTreeIdx, S);          DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");          return;        } -      newTreeEntry(VL, true, UserTreeIdx); +      newTreeEntry(VL, true, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");        // Reorder operands if reordering would enable vectorization.        
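      // Illustrative sketch (editor's example, not from this patch): for an
      // alternating add/sub bundle
      //
      //   VL = { a[0] + b[0], a[1] - b[1] }
      //
      // keeping the a[i] loads in Left and the b[i] loads in Right turns both
      // operand bundles into consecutive loads, enabling vectorization of
      // each side.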
if (isa<BinaryOperator>(VL0)) {          ValueList Left, Right; -        reorderAltShuffleOperands(VL, Left, Right); +        reorderAltShuffleOperands(S.Opcode, VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx); -        buildTree_rec(Right, Depth + 1, UserTreeIdx); +        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);          return;        } @@ -1626,13 +1953,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          for (Value *j : VL)            Operands.push_back(cast<Instruction>(j)->getOperand(i)); -        buildTree_rec(Operands, Depth + 1, UserTreeIdx); +        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);        }        return; -    } +      default:        BS.cancelScheduling(VL, VL0); -      newTreeEntry(VL, false, UserTreeIdx); +      newTreeEntry(VL, false, UserTreeIdx, S);        DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");        return;    } @@ -1663,19 +1990,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {    return N;  } -bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const { -  assert(Opcode == Instruction::ExtractElement || -         Opcode == Instruction::ExtractValue); -  assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); +bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const { +  Instruction *E0 = cast<Instruction>(OpValue); +  assert(E0->getOpcode() == Instruction::ExtractElement || +         E0->getOpcode() == Instruction::ExtractValue); +  assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");    // Check if all of the extracts come from the same vector and from the    // correct offset. -  Value *VL0 = VL[0]; -  Instruction *E0 = cast<Instruction>(VL0);    Value *Vec = E0->getOperand(0);    // We have to extract from a vector/aggregate with the same number of elements.    unsigned NElts; -  if (Opcode == Instruction::ExtractValue) { +  if (E0->getOpcode() == Instruction::ExtractValue) {      const DataLayout &DL = E0->getModule()->getDataLayout();      NElts = canMapToVector(Vec->getType(), DL);      if (!NElts) @@ -1692,20 +2018,24 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {      return false;    // Check that all of the indices extract from the correct offset. 
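  // Illustrative IR (editor's example): the sequence is reusable only when
  // lane i extracts index i from one common source vector, e.g.
  //
  //   %e0 = extractelement <4 x i32> %vec, i32 0
  //   %e1 = extractelement <4 x i32> %vec, i32 1
  //   %e2 = extractelement <4 x i32> %vec, i32 2
  //   %e3 = extractelement <4 x i32> %vec, i32 3
  //
  // A mismatched index or a different source vector fails the loop below.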
-  if (!matchExtractIndex(E0, 0, Opcode)) -    return false; - -  for (unsigned i = 1, e = VL.size(); i < e; ++i) { -    Instruction *E = cast<Instruction>(VL[i]); -    if (!matchExtractIndex(E, i, Opcode)) +  for (unsigned I = 0, E = VL.size(); I < E; ++I) { +    Instruction *Inst = cast<Instruction>(VL[I]); +    if (!matchExtractIndex(Inst, I, Inst->getOpcode()))        return false; -    if (E->getOperand(0) != Vec) +    if (Inst->getOperand(0) != Vec)        return false;    }    return true;  } +bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { +  return I->hasOneUse() || +         std::all_of(I->user_begin(), I->user_end(), [this](User *U) { +           return ScalarToTreeEntry.count(U) > 0; +         }); +} +  int BoUpSLP::getEntryCost(TreeEntry *E) {    ArrayRef<Value*> VL = E->Scalars; @@ -1728,28 +2058,47 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {      if (isSplat(VL)) {        return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);      } +    if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { +      Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); +      if (ShuffleKind.hasValue()) { +        int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); +        for (auto *V : VL) { +          // If all users of instruction are going to be vectorized and this +          // instruction itself is not going to be vectorized, consider this +          // instruction as dead and remove its cost from the final cost of the +          // vectorized tree. +          if (areAllUsersVectorized(cast<Instruction>(V)) && +              !ScalarToTreeEntry.count(V)) { +            auto *IO = cast<ConstantInt>( +                cast<ExtractElementInst>(V)->getIndexOperand()); +            Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, +                                            IO->getZExtValue()); +          } +        } +        return Cost; +      } +    }      return getGatherCost(E->Scalars);    } -  unsigned Opcode = getSameOpcode(VL); -  assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); -  Instruction *VL0 = cast<Instruction>(VL[0]); -  switch (Opcode) { -    case Instruction::PHI: { +  InstructionsState S = getSameOpcode(VL); +  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); +  Instruction *VL0 = cast<Instruction>(S.OpValue); +  unsigned ShuffleOrOp = S.IsAltShuffle ? +               (unsigned) Instruction::ShuffleVector : S.Opcode; +  switch (ShuffleOrOp) { +    case Instruction::PHI:        return 0; -    } +      case Instruction::ExtractValue: -    case Instruction::ExtractElement: { -      if (canReuseExtract(VL, Opcode)) { +    case Instruction::ExtractElement: +      if (canReuseExtract(VL, S.OpValue)) {          int DeadCost = 0;          for (unsigned i = 0, e = VL.size(); i < e; ++i) {            Instruction *E = cast<Instruction>(VL[i]);            // If all users are going to be vectorized, instruction can be            // considered as dead.            // The same, if have only one user, it will be vectorized for sure. -          if (E->hasOneUse() || -              std::all_of(E->user_begin(), E->user_end(), [this](User *U) { -                return ScalarToTreeEntry.count(U) > 0; -              })) +          if (areAllUsersVectorized(E))              // Take credit for instruction that will become dead.              
DeadCost +=                  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); @@ -1757,7 +2106,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {          return -DeadCost;        }        return getGatherCost(VecTy); -    } +      case Instruction::ZExt:      case Instruction::SExt:      case Instruction::FPToUI: @@ -1786,8 +2135,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {        // Calculate the cost of this instruction.        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());        int ScalarCost = VecTy->getNumElements() * -          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); -      int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); +          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); +      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);        return VecCost - ScalarCost;      }      case Instruction::Add: @@ -1848,9 +2197,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {        SmallVector<const Value *, 4> Operands(VL0->operand_values());        int ScalarCost =            VecTy->getNumElements() * -          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, +          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,                                        Op2VP, Operands); -      int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, +      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,                                                  Op1VP, Op2VP, Operands);        return VecCost - ScalarCost;      } @@ -1968,7 +2317,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {  }  bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { -    // We can vectorize the tree if its size is greater than or equal to the    // minimum size specified by the MinTreeSize command line option.    if (VectorizableTree.size() >= MinTreeSize) @@ -2139,13 +2487,18 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {  // load a[3] + load b[3]  // Reordering the second load b[1]  load a[1] would allow us to vectorize this  // code. -void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, +void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,                                          SmallVectorImpl<Value *> &Left,                                          SmallVectorImpl<Value *> &Right) {    // Push left and right operands of binary operation into Left and Right -  for (Value *i : VL) { -    Left.push_back(cast<Instruction>(i)->getOperand(0)); -    Right.push_back(cast<Instruction>(i)->getOperand(1)); +  unsigned AltOpcode = getAltOpcode(Opcode); +  (void)AltOpcode; +  for (Value *V : VL) { +    auto *I = cast<Instruction>(V); +    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && +           "Incorrect instruction in vector"); +    Left.push_back(I->getOperand(0)); +    Right.push_back(I->getOperand(1));    }    // Reorder if we have a commutative operation and consecutive access @@ -2190,14 +2543,12 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,  // The vectorizer is trying to either have all elements one side being  // instruction with the same opcode to enable further vectorization, or having  // a splat to lower the vectorizing cost. 
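// Illustrative sketch (editor's example, values are made up): with
//
//   VL = { a + a, a + b }
//
// keeping 'a' in the left column of both lanes preserves a splat,
//   Left = { a, a }, Right = { a, b },
// so commuting a lane's operands is requested only when it keeps a splat or
// a same-opcode column intact.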
-static bool shouldReorderOperands(int i, Instruction &I, -                                  SmallVectorImpl<Value *> &Left, -                                  SmallVectorImpl<Value *> &Right, -                                  bool AllSameOpcodeLeft, -                                  bool AllSameOpcodeRight, bool SplatLeft, -                                  bool SplatRight) { -  Value *VLeft = I.getOperand(0); -  Value *VRight = I.getOperand(1); +static bool shouldReorderOperands( +    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left, +    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, +    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { +  VLeft = I.getOperand(0); +  VRight = I.getOperand(1);    // If we have "SplatRight", try to see if commuting is needed to preserve it.    if (SplatRight) {      if (VRight == Right[i - 1]) @@ -2253,15 +2604,16 @@ static bool shouldReorderOperands(int i, Instruction &I,    return false;  } -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, +void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, +                                             ArrayRef<Value *> VL,                                               SmallVectorImpl<Value *> &Left,                                               SmallVectorImpl<Value *> &Right) { - -  if (VL.size()) { +  if (!VL.empty()) {      // Peel the first iteration out of the loop since there's nothing      // interesting to do anyway and it simplifies the checks in the loop. -    auto VLeft = cast<Instruction>(VL[0])->getOperand(0); -    auto VRight = cast<Instruction>(VL[0])->getOperand(1); +    auto *I = cast<Instruction>(VL[0]); +    Value *VLeft = I->getOperand(0); +    Value *VRight = I->getOperand(1);      if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))        // Favor having instruction to the right. FIXME: why?        std::swap(VLeft, VRight); @@ -2278,16 +2630,21 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,    for (unsigned i = 1, e = VL.size(); i != e; ++i) {      Instruction *I = cast<Instruction>(VL[i]); -    assert(I->isCommutative() && "Can only process commutative instruction"); +    assert(((I->getOpcode() == Opcode && I->isCommutative()) || +            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && +           "Can only process commutative instruction");      // Commute to favor either a splat or maximizing having the same opcodes on      // one side. -    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, -                              AllSameOpcodeRight, SplatLeft, SplatRight)) { -      Left.push_back(I->getOperand(1)); -      Right.push_back(I->getOperand(0)); +    Value *VLeft; +    Value *VRight; +    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, +                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, +                              VRight)) { +      Left.push_back(VRight); +      Right.push_back(VLeft);      } else { -      Left.push_back(I->getOperand(0)); -      Right.push_back(I->getOperand(1)); +      Left.push_back(VLeft); +      Right.push_back(VRight);      }      // Update Splat* and AllSameOpcode* after the insertion.      
SplatRight = SplatRight && (Right[i - 1] == Right[i]); @@ -2340,14 +2697,17 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,    }  } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { - +void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {    // Get the basic block this bundle is in. All instructions in the bundle    // should be in this block. -  auto *Front = cast<Instruction>(VL.front()); +  auto *Front = cast<Instruction>(OpValue);    auto *BB = Front->getParent(); -  assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { -    return cast<Instruction>(V)->getParent() == BB; +  const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode(); +  const unsigned AltOpcode = getAltOpcode(Opcode); +  assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { +    return !sameOpcodeOrAlt(Opcode, AltOpcode, +                            cast<Instruction>(V)->getOpcode()) || +           cast<Instruction>(V)->getParent() == BB;    }));    // The last instruction in the bundle in program order. @@ -2358,10 +2718,12 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {    // VL.back() and iterate over schedule data until we reach the end of the    // bundle. The end of the bundle is marked by null ScheduleData.    if (BlocksSchedules.count(BB)) { -    auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); +    auto *Bundle = +        BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));      if (Bundle && Bundle->isPartOfBundle())        for (; Bundle; Bundle = Bundle->NextInBundle) -        LastInst = Bundle->Inst; +        if (Bundle->OpValue == Bundle->Inst) +          LastInst = Bundle->Inst;    }    // LastInst can still be null at this point if there's either not an entry @@ -2385,7 +2747,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {    if (!LastInst) {      SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());      for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { -      if (Bundle.erase(&I)) +      if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))          LastInst = &I;        if (Bundle.empty())          break; @@ -2435,27 +2797,41 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {    return nullptr;  } -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { -  if (TreeEntry *E = getTreeEntry(VL[0])) -    if (E->isSame(VL)) -      return vectorizeTree(E); +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) { +  InstructionsState S = getSameOpcode(VL); +  if (S.Opcode) { +    if (TreeEntry *E = getTreeEntry(S.OpValue)) { +      TreeEntry *UserTreeEntry = nullptr; +      if (UserIndx != -1) +        UserTreeEntry = &VectorizableTree[UserIndx]; + +      if (E->isSame(VL) || +          (UserTreeEntry && +           (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && +           !UserTreeEntry->ShuffleMask[OpdNum].empty() && +           E->isFoundJumbled(VL, *DL, *SE))) +        return vectorizeTree(E, OpdNum, UserIndx); +    } +  } -  Type *ScalarTy = VL[0]->getType(); -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) +  Type *ScalarTy = S.OpValue->getType(); +  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))      ScalarTy = SI->getValueOperand()->getType();    VectorType *VecTy = VectorType::get(ScalarTy, VL.size());    return Gather(VL, VecTy);  } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value 
*BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {    IRBuilder<>::InsertPointGuard Guard(Builder); +  TreeEntry *UserTreeEntry = nullptr;    if (E->VectorizedValue) {      DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");      return E->VectorizedValue;    } +  InstructionsState S = getSameOpcode(E->Scalars);    Instruction *VL0 = cast<Instruction>(E->Scalars[0]);    Type *ScalarTy = VL0->getType();    if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) @@ -2463,15 +2839,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {    VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());    if (E->NeedToGather) { -    setInsertPointAfterBundle(E->Scalars); +    setInsertPointAfterBundle(E->Scalars, VL0);      auto *V = Gather(E->Scalars, VecTy);      E->VectorizedValue = V;      return V;    } -  unsigned Opcode = getSameOpcode(E->Scalars); +  assert(ScalarToTreeEntry.count(E->Scalars[0]) && +         "Expected user tree entry, missing!"); +  int CurrIndx = ScalarToTreeEntry[E->Scalars[0]]; -  switch (Opcode) { +  unsigned ShuffleOrOp = S.IsAltShuffle ? +           (unsigned) Instruction::ShuffleVector : S.Opcode; +  switch (ShuffleOrOp) {      case Instruction::PHI: {        PHINode *PH = dyn_cast<PHINode>(VL0);        Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); @@ -2498,7 +2878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          Builder.SetInsertPoint(IBB->getTerminator());          Builder.SetCurrentDebugLocation(PH->getDebugLoc()); -        Value *Vec = vectorizeTree(Operands); +        Value *Vec = vectorizeTree(Operands, i, CurrIndx);          NewPhi->addIncoming(Vec, IBB);        } @@ -2508,18 +2888,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {      }      case Instruction::ExtractElement: { -      if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) { +      if (canReuseExtract(E->Scalars, VL0)) {          Value *V = VL0->getOperand(0);          E->VectorizedValue = V;          return V;        } -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0);        auto *V = Gather(E->Scalars, VecTy);        E->VectorizedValue = V;        return V;      }      case Instruction::ExtractValue: { -      if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { +      if (canReuseExtract(E->Scalars, VL0)) {          LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));          Builder.SetInsertPoint(LI);          PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -2528,7 +2908,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          E->VectorizedValue = V;          return propagateMetadata(V, E->Scalars);        } -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0);        auto *V = Gather(E->Scalars, VecTy);        E->VectorizedValue = V;        return V; @@ -2549,9 +2929,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        for (Value *V : E->Scalars)          INVL.push_back(cast<Instruction>(V)->getOperand(0)); -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *InVec = vectorizeTree(INVL); +      Value *InVec = vectorizeTree(INVL, 0, CurrIndx);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -2570,23 +2950,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          RHSV.push_back(cast<Instruction>(V)->getOperand(1));        } -      setInsertPointAfterBundle(E->Scalars); +      
setInsertPointAfterBundle(E->Scalars, VL0); -      Value *L = vectorizeTree(LHSV); -      Value *R = vectorizeTree(RHSV); +      Value *L = vectorizeTree(LHSV, 0, CurrIndx); +      Value *R = vectorizeTree(RHSV, 1, CurrIndx);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V;        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();        Value *V; -      if (Opcode == Instruction::FCmp) +      if (S.Opcode == Instruction::FCmp)          V = Builder.CreateFCmp(P0, L, R);        else          V = Builder.CreateICmp(P0, L, R);        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars); +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions;        return V;      } @@ -2598,11 +2978,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          FalseVec.push_back(cast<Instruction>(V)->getOperand(2));        } -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *Cond = vectorizeTree(CondVec); -      Value *True = vectorizeTree(TrueVec); -      Value *False = vectorizeTree(FalseVec); +      Value *Cond = vectorizeTree(CondVec, 0, CurrIndx); +      Value *True = vectorizeTree(TrueVec, 1, CurrIndx); +      Value *False = vectorizeTree(FalseVec, 2, CurrIndx);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; @@ -2632,25 +3012,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {      case Instruction::Xor: {        ValueList LHSVL, RHSVL;        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) -        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); +        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, +                                       RHSVL);        else          for (Value *V : E->Scalars) { -          LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); -          RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); +          auto *I = cast<Instruction>(V); +          LHSVL.push_back(I->getOperand(0)); +          RHSVL.push_back(I->getOperand(1));          } -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *LHS = vectorizeTree(LHSVL); -      Value *RHS = vectorizeTree(RHSVL); +      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); +      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V; -      BinaryOperator *BinOp = cast<BinaryOperator>(VL0); -      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); +      Value *V = Builder.CreateBinOp( +          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars); +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions;        if (Instruction *I = dyn_cast<Instruction>(V)) @@ -2661,9 +3043,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {      case Instruction::Load: {        // Loads are inserted at the head of the tree because we don't want to        // sink them all the way down past store instructions. 
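      // Illustrative sketch (editor's example, not from this patch): sinking
      // the widened load below a possibly-aliasing store would change what
      // it reads, e.g.
      //
      //   %x = load i32, i32* %p      ; bundle position: emit vector load here
      //   store i32 0, i32* %q        ; %q may alias %p
      //   ...                         ; emitting the vector load down here
      //                               ; would be unsafe
      //
      // so the insert point stays at the bundle rather than lower down.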
-      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0); + +      if (UserIndx != -1) +        UserTreeEntry = &VectorizableTree[UserIndx]; + +      bool isJumbled = false; +      LoadInst *LI = NULL; +      if (UserTreeEntry && +          (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && +          !UserTreeEntry->ShuffleMask[OpdNum].empty()) { +        isJumbled = true; +        LI = cast<LoadInst>(E->Scalars[0]); +      } else { +        LI = cast<LoadInst>(VL0); +      } -      LoadInst *LI = cast<LoadInst>(VL0);        Type *ScalarLoadTy = LI->getType();        unsigned AS = LI->getPointerAddressSpace(); @@ -2685,47 +3080,60 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        LI->setAlignment(Alignment);        E->VectorizedValue = LI;        ++NumVectorInstructions; -      return propagateMetadata(LI, E->Scalars); +      propagateMetadata(LI, E->Scalars); + +      if (isJumbled) { +        SmallVector<Constant *, 8> Mask; +        for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum]) +          Mask.push_back(Builder.getInt32(LaneEntry)); +        // Generate shuffle for jumbled memory access +        Value *Undef = UndefValue::get(VecTy); +        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, +                                                  ConstantVector::get(Mask)); +        E->VectorizedValue = Shuf; +        ++NumVectorInstructions; +        return Shuf; +      } +      return LI;      }      case Instruction::Store: {        StoreInst *SI = cast<StoreInst>(VL0);        unsigned Alignment = SI->getAlignment();        unsigned AS = SI->getPointerAddressSpace(); -      ValueList ValueOp; +      ValueList ScalarStoreValues;        for (Value *V : E->Scalars) -        ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); +        ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *VecValue = vectorizeTree(ValueOp); -      Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), -                                            VecTy->getPointerTo(AS)); +      Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx); +      Value *ScalarPtr = SI->getPointerOperand(); +      Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));        StoreInst *S = Builder.CreateStore(VecValue, VecPtr); -      // The pointer operand uses an in-tree scalar so we add the new BitCast to -      // ExternalUses list to make sure that an extract will be generated in the +      // The pointer operand uses an in-tree scalar, so add the new BitCast to +      // ExternalUses to make sure that an extract will be generated in the        // future. 
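      // Editor's note (illustrative, the lane index is an assumption): if the
      // scalar pointer is itself part of the vectorized tree, the bitcast
      // created above becomes an out-of-tree user of it, and the recorded
      // ExternalUser makes the later extract pass emit something like
      //
      //   %p = extractelement <4 x i32*> %vec.ptrs, i32 0
      //
      // to feed the bitcast once the scalar pointer is erased.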
-      Value *PO = SI->getPointerOperand(); -      if (getTreeEntry(PO)) -        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); +      if (getTreeEntry(ScalarPtr)) +        ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); -      if (!Alignment) { +      if (!Alignment)          Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); -      } +        S->setAlignment(Alignment);        E->VectorizedValue = S;        ++NumVectorInstructions;        return propagateMetadata(S, E->Scalars);      }      case Instruction::GetElementPtr: { -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0);        ValueList Op0VL;        for (Value *V : E->Scalars)          Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); -      Value *Op0 = vectorizeTree(Op0VL); +      Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);        std::vector<Value *> OpVecs;        for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; @@ -2734,7 +3142,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          for (Value *V : E->Scalars)            OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); -        Value *OpVec = vectorizeTree(OpVL); +        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);          OpVecs.push_back(OpVec);        } @@ -2750,7 +3158,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {      }      case Instruction::Call: {        CallInst *CI = cast<CallInst>(VL0); -      setInsertPointAfterBundle(E->Scalars); +      setInsertPointAfterBundle(E->Scalars, VL0);        Function *FI;        Intrinsic::ID IID  = Intrinsic::not_intrinsic;        Value *ScalarArg = nullptr; @@ -2763,7 +3171,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          // ctlz,cttz and powi are special intrinsics whose second argument is          // a scalar. This argument should not be vectorized.          
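        // Illustrative IR (editor's example): the exponent of powi stays
        // scalar while the base operand is widened,
        //
        //   %r = call <4 x float> @llvm.powi.v4f32(<4 x float> %base, i32 3)
        //
        // so the branch below forwards operand 1 unchanged.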
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { -          CallInst *CEI = cast<CallInst>(E->Scalars[0]); +          CallInst *CEI = cast<CallInst>(VL0);            ScalarArg = CEI->getArgOperand(j);            OpVecs.push_back(CEI->getArgOperand(j));            continue; @@ -2773,7 +3181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {            OpVL.push_back(CEI->getArgOperand(j));          } -        Value *OpVec = vectorizeTree(OpVL); +        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);          DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");          OpVecs.push_back(OpVec);        } @@ -2793,30 +3201,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars); +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions;        return V;      }      case Instruction::ShuffleVector: {        ValueList LHSVL, RHSVL; -      assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); -      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); -      setInsertPointAfterBundle(E->Scalars); +      assert(Instruction::isBinaryOp(S.Opcode) && +             "Invalid Shuffle Vector Operand"); +      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); +      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *LHS = vectorizeTree(LHSVL); -      Value *RHS = vectorizeTree(RHSVL); +      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); +      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);        if (Value *V = alreadyVectorized(E->Scalars, VL0))          return V;        // Create a vector of LHS op1 RHS -      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); -      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); +      Value *V0 = Builder.CreateBinOp( +          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); +      unsigned AltOpcode = getAltOpcode(S.Opcode);        // Create a vector of LHS op2 RHS -      Instruction *VL1 = cast<Instruction>(E->Scalars[1]); -      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); -      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); +      Value *V1 = Builder.CreateBinOp( +          static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);        // Create shuffle to take alternate operations from the vector.        // Also, gather up odd and even scalar ops to propagate IR flags to @@ -2859,7 +3268,6 @@ Value *BoUpSLP::vectorizeTree() {  Value *  BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { -    // All blocks must be scheduled before any instructions are inserted.    
for (auto &BSIter : BlocksSchedules) {      scheduleBlock(BSIter.second.get()); @@ -2905,9 +3313,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {        continue;      TreeEntry *E = getTreeEntry(Scalar);      assert(E && "Invalid scalar"); -    assert(!E->NeedToGather && "Extracting from a gather list"); +    assert((!E->NeedToGather) && "Extracting from a gather list"); -    Value *Vec = E->VectorizedValue; +    Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue); +    if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) { +      Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0); +    } else { +      Vec = E->VectorizedValue; +    }      assert(Vec && "Can't find vectorizable value");      Value *Lane = Builder.getInt32(ExternalUse.Lane); @@ -2975,14 +3388,15 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {    for (TreeEntry &EIdx : VectorizableTree) {      TreeEntry *Entry = &EIdx; +    // No need to handle users of gathered values. +    if (Entry->NeedToGather) +      continue; + +    assert(Entry->VectorizedValue && "Can't find vectorizable value"); +      // For each lane:      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {        Value *Scalar = Entry->Scalars[Lane]; -      // No need to handle users of gathered values. -      if (Entry->NeedToGather) -        continue; - -      assert(Entry->VectorizedValue && "Can't find vectorizable value");        Type *Ty = Scalar->getType();        if (!Ty->isVoidTy()) { @@ -2990,9 +3404,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {          for (User *U : Scalar->users()) {            DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); -          assert((getTreeEntry(U) || -                  // It is legal to replace users in the ignorelist by undef. -                  is_contained(UserIgnoreList, U)) && +          // It is legal to replace users in the ignorelist by undef. +          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&                   "Replacing out-of-tree value with undef");          }  #endif @@ -3009,7 +3422,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {    return VectorizableTree[0].VectorizedValue;  } -void BoUpSLP::optimizeGatherSequence() { +void BoUpSLP::optimizeGatherSequence(Function &F) {    DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()          << " gather sequences instructions.\n");    // LICM InsertElementInst sequences. @@ -3043,30 +3456,16 @@ void BoUpSLP::optimizeGatherSequence() {      Insert->moveBefore(PreHeader->getTerminator());    } -  // Make a list of all reachable blocks in our CSE queue. -  SmallVector<const DomTreeNode *, 8> CSEWorkList; -  CSEWorkList.reserve(CSEBlocks.size()); -  for (BasicBlock *BB : CSEBlocks) -    if (DomTreeNode *N = DT->getNode(BB)) { -      assert(DT->isReachableFromEntry(N)); -      CSEWorkList.push_back(N); -    } - -  // Sort blocks by domination. This ensures we visit a block after all blocks -  // dominating it are visited. -  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), -                   [this](const DomTreeNode *A, const DomTreeNode *B) { -    return DT->properlyDominates(A, B); -  }); -    // Perform O(N^2) search over the gather sequences and merge identical    // instructions. TODO: We can further optimize this scan if we split the    // instructions into different buckets based on the insert lane.    
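  // Illustrative sketch (editor's example): two gather sequences that build
  // the same vector,
  //
  //   %g0 = insertelement <2 x i32> undef, i32 %a, i32 0
  //   %g1 = insertelement <2 x i32> %g0, i32 %b, i32 1
  //   ...
  //   %h0 = insertelement <2 x i32> undef, i32 %a, i32 0
  //   %h1 = insertelement <2 x i32> %h0, i32 %b, i32 1
  //
  // are candidates for this scan; when the first chain is available at the
  // second one's position, the duplicate is replaced and erased.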
SmallVector<Instruction *, 16> Visited; -  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { -    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && -           "Worklist not sorted properly!"); -    BasicBlock *BB = (*I)->getBlock(); +  ReversePostOrderTraversal<Function *> RPOT(&F); +  for (auto BB : RPOT) { +    // Traverse CSEBlocks by RPOT order. +    if (!CSEBlocks.count(BB)) +      continue; +      // For all instructions in blocks containing gather sequences:      for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {        Instruction *In = &*it++; @@ -3111,7 +3510,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,    // Make sure that the scheduling region contains all    // instructions of the bundle.    for (Value *V : VL) { -    if (!extendSchedulingRegion(V)) +    if (!extendSchedulingRegion(V, OpValue))        return false;    } @@ -3148,8 +3547,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,      // It is seldom that this needs to be done a second time after adding the      // initial bundle to the region.      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { -      ScheduleData *SD = getScheduleData(I); -      SD->clearDependencies(); +      doForAllOpcodes(I, [](ScheduleData *SD) { +        SD->clearDependencies(); +      });      }      ReSchedule = true;    } @@ -3210,17 +3610,43 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,    }  } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { -  if (getScheduleData(V)) +BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { +  // Allocate a new ScheduleData for the instruction. +  if (ChunkPos >= ChunkSize) { +    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); +    ChunkPos = 0; +  } +  return &(ScheduleDataChunks.back()[ChunkPos++]); +} + +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, +                                                      Value *OpValue) { +  if (getScheduleData(V, isOneOf(OpValue, V)))      return true;    Instruction *I = dyn_cast<Instruction>(V);    assert(I && "bundle member must be an instruction");    assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); +  auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool { +    ScheduleData *ISD = getScheduleData(I); +    if (!ISD) +      return false; +    assert(isInSchedulingRegion(ISD) && +           "ScheduleData not in scheduling region"); +    ScheduleData *SD = allocateScheduleDataChunks(); +    SD->Inst = I; +    SD->init(SchedulingRegionID, OpValue); +    ExtraScheduleDataMap[I][OpValue] = SD; +    return true; +  }; +  if (CheckSheduleForI(I)) +    return true;    if (!ScheduleStart) {      // It's the first instruction in the new region.      
initScheduleData(I, I->getNextNode(), nullptr, nullptr);      ScheduleStart = I;      ScheduleEnd = I->getNextNode(); +    if (isOneOf(OpValue, I) != I) +      CheckSheduleForI(I);      assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");      DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");      return true; @@ -3232,7 +3658,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {    BasicBlock::reverse_iterator UpperEnd = BB->rend();    BasicBlock::iterator DownIter = ScheduleEnd->getIterator();    BasicBlock::iterator LowerEnd = BB->end(); -  for (;;) { +  while (true) {      if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {        DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");        return false; @@ -3242,6 +3668,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {        if (&*UpIter == I) {          initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);          ScheduleStart = I; +        if (isOneOf(OpValue, I) != I) +          CheckSheduleForI(I);          DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");          return true;        } @@ -3252,6 +3680,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {          initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,                           nullptr);          ScheduleEnd = I->getNextNode(); +        if (isOneOf(OpValue, I) != I) +          CheckSheduleForI(I);          assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");          DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");          return true; @@ -3272,21 +3702,17 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,    for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {      ScheduleData *SD = ScheduleDataMap[I];      if (!SD) { -      // Allocate a new ScheduleData for the instruction. -      if (ChunkPos >= ChunkSize) { -        ScheduleDataChunks.push_back( -            llvm::make_unique<ScheduleData[]>(ChunkSize)); -        ChunkPos = 0; -      } -      SD = &(ScheduleDataChunks.back()[ChunkPos++]); +      SD = allocateScheduleDataChunks();        ScheduleDataMap[I] = SD;        SD->Inst = I;      }      assert(!isInSchedulingRegion(SD) &&             "new ScheduleData already in scheduling region"); -    SD->init(SchedulingRegionID); +    SD->init(SchedulingRegionID, I); -    if (I->mayReadOrWriteMemory()) { +    if (I->mayReadOrWriteMemory() && +        (!isa<IntrinsicInst>(I) || +         cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {        // Update the linked list of memory accessing instructions.        if (CurrentLoadStore) {          CurrentLoadStore->NextLoadStore = SD; @@ -3326,23 +3752,35 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,          BundleMember->resetUnscheduledDeps();          // Handle def-use chain dependencies. 
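        // Editor's note (summary of the change below, not patch text): with
        // alternate opcodes an instruction may now own two ScheduleData
        // entries, the real one keyed by the instruction and a proxy keyed by
        // the bundle's OpValue; the new branch makes the proxy depend on the
        // real entry's bundle instead of re-walking the instruction's users.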
-        for (User *U : BundleMember->Inst->users()) {
-          if (isa<Instruction>(U)) {
-            ScheduleData *UseSD = getScheduleData(U);
-            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+        if (BundleMember->OpValue != BundleMember->Inst) {
+          ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+            BundleMember->Dependencies++;
+            ScheduleData *DestBundle = UseSD->FirstInBundle;
+            if (!DestBundle->IsScheduled)
+              BundleMember->incrementUnscheduledDeps(1);
+            if (!DestBundle->hasValidDependencies())
+              WorkList.push_back(DestBundle);
+          }
+        } else {
+          for (User *U : BundleMember->Inst->users()) {
+            if (isa<Instruction>(U)) {
+              ScheduleData *UseSD = getScheduleData(U);
+              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+                BundleMember->Dependencies++;
+                ScheduleData *DestBundle = UseSD->FirstInBundle;
+                if (!DestBundle->IsScheduled)
+                  BundleMember->incrementUnscheduledDeps(1);
+                if (!DestBundle->hasValidDependencies())
+                  WorkList.push_back(DestBundle);
+              }
+            } else {
+              // It is unclear whether this can ever happen, but we need to be
+              // safe. This keeps the instruction/bundle from ever being
+              // scheduled and eventually disables vectorization.
               BundleMember->Dependencies++;
-              ScheduleData *DestBundle = UseSD->FirstInBundle;
-              if (!DestBundle->IsScheduled)
-                BundleMember->incrementUnscheduledDeps(1);
-              if (!DestBundle->hasValidDependencies())
-                WorkList.push_back(DestBundle);
+              BundleMember->incrementUnscheduledDeps(1);
             }
-          } else {
-            // I'm not sure if this can ever happen. But we need to be safe.
-            // This lets the instruction/bundle never be scheduled and
-            // eventually disable vectorization.
-            BundleMember->Dependencies++;
-            BundleMember->incrementUnscheduledDeps(1);
           }
         }
@@ -3419,16 +3857,17 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   assert(ScheduleStart &&
          "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = getScheduleData(I);
-    assert(isInSchedulingRegion(SD));
-    SD->IsScheduled = false;
-    SD->resetUnscheduledDeps();
+    doForAllOpcodes(I, [&](ScheduleData *SD) {
+      assert(isInSchedulingRegion(SD) &&
+             "ScheduleData not in scheduling region");
+      SD->IsScheduled = false;
+      SD->resetUnscheduledDeps();
+    });
   }
   ReadyInsts.clear();
 }
 
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
-
   if (!BS->ScheduleStart)
     return;
@@ -3452,15 +3891,16 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   int NumToSchedule = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    assert(
-        SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
-        "scheduler and vectorizer have different opinion on what is a bundle");
-    SD->FirstInBundle->SchedulingPriority = Idx++;
-    if (SD->isSchedulingEntity()) {
-      BS->calculateDependencies(SD, false, this);
-      NumToSchedule++;
-    }
+    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+      assert(SD->isPartOfBundle() ==
+                 (getTreeEntry(SD->Inst) != nullptr) &&
+             "scheduler and vectorizer bundle mismatch");
+      SD->FirstInBundle->SchedulingPriority = Idx++;
+      if (SD->isSchedulingEntity()) {
+        BS->calculateDependencies(SD, false, this);
+        NumToSchedule++;
+      }
+    });
   }
   BS->initialFillReadyList(ReadyInsts);
@@ -3559,7 +3999,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
 
 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
                                   SmallVectorImpl<Value *> &ToDemote,
                                   SmallVectorImpl<Value *> &Roots) {
-
   // We can always demote constants.
   if (isa<Constant>(V)) {
     ToDemote.push_back(V);
@@ -3702,7 +4141,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // Determine if the sign bit of all the roots is known to be zero. If not,
     // IsKnownPositive is set to False.
-    IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });
@@ -3710,7 +4149,7 @@ void BoUpSLP::computeMinimumValueSizes() {
     // Determine the maximum number of bits required to store the scalar
     // values.
     for (auto *Scalar : ToDemote) {
-      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
+      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
       auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
       MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
     }
@@ -3755,6 +4194,7 @@ void BoUpSLP::computeMinimumValueSizes() {
 }
 
 namespace {
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   SLPVectorizerPass Impl;
@@ -3766,7 +4206,6 @@ struct SLPVectorizer : public FunctionPass {
     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
   }
-
   bool doInitialization(Module &M) override {
     return false;
   }
@@ -3806,6 +4245,7 @@ struct SLPVectorizer : public FunctionPass {
     AU.setPreservesCFG();
   }
 };
+
 } // end anonymous namespace
 
 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
@@ -3893,7 +4333,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   }
 
   if (Changed) {
-    R.optimizeGatherSequence();
+    R.optimizeGatherSequence(F);
     DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
     DEBUG(verifyFunction(F));
   }
@@ -3952,7 +4392,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+
       using namespace ore;
+
       R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                           cast<StoreInst>(Chain[i]))
                        << "Stores SLP vectorized with cost " << NV("Cost", Cost)
@@ -3972,7 +4414,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
 
 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                         BoUpSLP &R) {
-  SetVector<StoreInst *> Heads, Tails;
+  SetVector<StoreInst *> Heads;
+  SmallDenseSet<StoreInst *> Tails;
   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
 
   // We may run into multiple chains that merge into a single chain. We mark the
@@ -3980,45 +4423,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  // Do a quadratic search on all of the given stores and find
+  // Do a quadratic search on all of the given stores in reverse order and find
   // all of the pairs of stores that follow each other.
   SmallVector<unsigned, 16> IndexQueue;
-  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
-    IndexQueue.clear();
+  unsigned E = Stores.size();
+  IndexQueue.resize(E - 1);
+  for (unsigned I = E; I > 0; --I) {
+    unsigned Idx = I - 1;
     // If a store has multiple consecutive store candidates, search Stores
-    // array according to the sequence: from i+1 to e, then from i-1 to 0.
+    // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
     // This is because pairing with the immediately succeeding or preceding
     // candidate usually creates the best chance to find an SLP vectorization
     // opportunity.
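    // Illustrative example (not part of the patch): with E = 6 stores and
    // Idx = 3, the rewritten loop below fills IndexQueue with 2, 4, 1, 5, 0 -
    // nearest neighbors first, alternating below/above Idx until each
    // direction runs out.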
-    unsigned j = 0;
-    for (j = i + 1; j < e; ++j)
-      IndexQueue.push_back(j);
-    for (j = i; j > 0; --j)
-      IndexQueue.push_back(j - 1);
-
-    for (auto &k : IndexQueue) {
-      if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
-        Tails.insert(Stores[k]);
-        Heads.insert(Stores[i]);
-        ConsecutiveChain[Stores[i]] = Stores[k];
+    unsigned Offset = 1;
+    unsigned Cnt = 0;
+    for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
+      if (Idx >= Offset) {
+        IndexQueue[Cnt] = Idx - Offset;
+        ++Cnt;
+      }
+      if (Idx + Offset < E) {
+        IndexQueue[Cnt] = Idx + Offset;
+        ++Cnt;
+      }
+    }
+
+    for (auto K : IndexQueue) {
+      if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
+        Tails.insert(Stores[Idx]);
+        Heads.insert(Stores[K]);
+        ConsecutiveChain[Stores[K]] = Stores[Idx];
         break;
       }
     }
   }
 
   // For stores that start but don't end a link in the chain:
-  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
-       it != e; ++it) {
-    if (Tails.count(*it))
+  for (auto *SI : llvm::reverse(Heads)) {
+    if (Tails.count(SI))
       continue;
 
     // We found a store instr that starts a chain. Now follow the chain and try
     // to vectorize it.
     BoUpSLP::ValueList Operands;
-    StoreInst *I = *it;
+    StoreInst *I = SI;
     // Collect the chain into a list.
-    while (Tails.count(I) || Heads.count(I)) {
-      if (VectorizedStores.count(I))
-        break;
+    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
       Operands.push_back(I);
       // Move to the next value in the chain.
       I = ConsecutiveChain[I];
@@ -4041,7 +4490,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
 }
 
 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
-
   // Initialize the collections. We will make a single pass over the block.
   Stores.clear();
   GEPs.clear();
@@ -4050,7 +4498,6 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
   // Stores and GEPs according to the underlying objects of their pointer
   // operands.
   for (Instruction &I : *BB) {
-
     // Ignore store instructions that are volatile or have a pointer operand
     // that doesn't point to a scalar type.
     if (auto *SI = dyn_cast<StoreInst>(&I)) {
@@ -4086,7 +4533,8 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                            ArrayRef<Value *> BuildVector,
-                                           bool AllowReorder) {
+                                           bool AllowReorder,
+                                           bool NeedExtraction) {
   if (VL.size() < 2)
     return false;
@@ -4103,19 +4551,51 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
-  if (MaxVF < 2)
-    return false;
+  if (MaxVF < 2) {
+     R.getORE()->emit([&]() {
+         return OptimizationRemarkMissed(
+                    SV_NAME, "SmallVF", I0)
+                << "Cannot SLP vectorize list: vectorization factor "
+                << "less than 2 is not supported";
+     });
+     return false;
+  }
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
-    if (!isValidElementType(Ty))
+    if (!isValidElementType(Ty)) {
+      // NOTE: the following will give the user the internal LLVM type name,
+      // which may not be useful.
+      R.getORE()->emit([&]() {
+          std::string type_str;
+          llvm::raw_string_ostream rso(type_str);
+          Ty->print(rso);
+          return OptimizationRemarkMissed(
+                     SV_NAME, "UnsupportedType", I0)
+                 << "Cannot SLP vectorize list: type "
+                 << rso.str() + " is unsupported by vectorizer";
+      });
       return false;
+    }
     Instruction *Inst = dyn_cast<Instruction>(V);
-    if (!Inst || Inst->getOpcode() != Opcode0)
+
+    if (!Inst)
+        return false;
+    if (Inst->getOpcode() != Opcode0) {
+      R.getORE()->emit([&]() {
+          return OptimizationRemarkMissed(
+                     SV_NAME, "InequableTypes", I0)
+                 << "Cannot SLP vectorize list: not all of the "
+                 << "parts of scalar instructions are of the same type: "
+                 << ore::NV("Instruction1Opcode", I0) << " and "
+                 << ore::NV("Instruction2Opcode", Inst);
+      });
       return false;
+    }
   }
 
   bool Changed = false;
+  bool CandidateFound = false;
+  int MinCost = SLPCostThreshold;
 
   // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
@@ -4148,11 +4628,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                    << "\n");
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
 
+      ArrayRef<Value *> EmptyArray;
       ArrayRef<Value *> BuildVectorSlice;
       if (!BuildVector.empty())
         BuildVectorSlice = BuildVector.slice(I, OpsWidth);
 
-      R.buildTree(Ops, BuildVectorSlice);
+      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
       // TODO: check if we can allow reordering for more cases.
       if (AllowReorder && R.shouldReorder()) {
         // Conceptually, there is nothing actually preventing us from trying to
@@ -4169,14 +4650,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       R.computeMinimumValueSizes();
       int Cost = R.getTreeCost();
+      CandidateFound = true;
+      MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
         DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
-                                            cast<Instruction>(Ops[0]))
-                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
-                         << " and with tree size "
-                         << ore::NV("TreeSize", R.getTreeSize()));
+                                                    cast<Instruction>(Ops[0]))
+                                 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+                                 << " and with tree size "
+                                 << ore::NV("TreeSize", R.getTreeSize()));
 
         Value *VectorizedRoot = R.vectorizeTree();
@@ -4199,8 +4682,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                 cast<Instruction>(Builder.CreateExtractElement(
                     VectorizedRoot, Builder.getInt32(VecIdx++)));
             I->setOperand(1, Extract);
-            I->removeFromParent();
-            I->insertAfter(Extract);
+            I->moveAfter(Extract);
             InsertAfter = I;
           }
         }
@@ -4212,18 +4694,37 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     }
   }
 
+  if (!Changed && CandidateFound) {
+    R.getORE()->emit([&]() {
+        return OptimizationRemarkMissed(
+                   SV_NAME, "NotBeneficial", I0)
+               << "List vectorization was possible but not beneficial with cost "
+               << ore::NV("Cost", MinCost) << " >= "
+               << ore::NV("Threshold", -SLPCostThreshold);
+    });
+  } else if (!Changed) {
+    R.getORE()->emit([&]() {
+        return OptimizationRemarkMissed(
+                   SV_NAME, "NotPossible", I0)
+               << "Cannot SLP vectorize list: vectorization was impossible"
+               << " with available vectorization factors";
+    });
+  }
   return Changed;
 }
 
-bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
-  if (!V)
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+  if (!I)
     return false;
 
-  Value *P = V->getParent();
+  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+    return false;
+
+  Value *P = I->getParent();
 
   // Vectorize in current basic block only.
-  auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
-  auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
     return false;
@@ -4286,6 +4787,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
 }
 
 namespace {
+
 /// Model horizontal reductions.
 ///
 /// A horizontal reduction is a tree of reduction operations (currently add and
@@ -4314,17 +4816,375 @@ namespace {
 ///   *p =
 ///
 class HorizontalReduction {
-  SmallVector<Value *, 16> ReductionOps;
+  using ReductionOpsType = SmallVector<Value *, 16>;
+  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+  ReductionOpsListType ReductionOps;
   SmallVector<Value *, 32> ReducedVals;
   // Use map vector to make stable output.
   MapVector<Instruction *, Value *> ExtraArgs;
 
-  BinaryOperator *ReductionRoot = nullptr;
+  /// Kind of the reduction data.
+  enum ReductionKind {
+    RK_None,       /// Not a reduction.
+    RK_Arithmetic, /// Binary reduction data.
+    RK_Min,        /// Minimum reduction data.
+    RK_UMin,       /// Unsigned minimum reduction data.
+    RK_Max,        /// Maximum reduction data.
+    RK_UMax,       /// Unsigned maximum reduction data.
+  };
+
+  /// Contains info about operation, like its opcode, left and right operands.
+  class OperationData {
+    /// Opcode of the instruction.
+    unsigned Opcode = 0;
+
+    /// Left operand of the reduction operation.
+    Value *LHS = nullptr;
+
+    /// Right operand of the reduction operation.
+    Value *RHS = nullptr;
+
+    /// Kind of the reduction operation.
+    ReductionKind Kind = RK_None;
+
+    /// True if a floating-point min/max reduction has no NaNs.
+    bool NoNaN = false;
+
+    /// Checks if the reduction operation can be vectorized.
+    bool isVectorizable() const {
+      return LHS && RHS &&
+             // We currently only support adds && min/max reductions.
+             ((Kind == RK_Arithmetic &&
+               (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
+              ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+               (Kind == RK_Min || Kind == RK_Max)) ||
+              (Opcode == Instruction::ICmp &&
+               (Kind == RK_UMin || Kind == RK_UMax)));
+    }
+
+    /// Creates reduction operation with the current opcode.
+    Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
+      assert(isVectorizable() &&
+             "Expected add|fadd or min/max reduction operation.");
+      Value *Cmp;
+      switch (Kind) {
+      case RK_Arithmetic:
+        return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
+                                   Name);
+      case RK_Min:
+        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
+                                          : Builder.CreateFCmpOLT(LHS, RHS);
+        break;
+      case RK_Max:
+        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
+                                          : Builder.CreateFCmpOGT(LHS, RHS);
+        break;
+      case RK_UMin:
+        assert(Opcode == Instruction::ICmp && "Expected integer types.");
+        Cmp = Builder.CreateICmpULT(LHS, RHS);
+        break;
+      case RK_UMax:
+        assert(Opcode == Instruction::ICmp && "Expected integer types.");
+        Cmp = Builder.CreateICmpUGT(LHS, RHS);
+        break;
+      case RK_None:
+        llvm_unreachable("Unknown reduction operation.");
+      }
+      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+    }
+
+  public:
+    explicit OperationData() = default;
+
+    /// Construction for reduced values. They are identified by opcode only and
+    /// don't have associated LHS/RHS values.
+    explicit OperationData(Value *V) {
+      if (auto *I = dyn_cast<Instruction>(V))
+        Opcode = I->getOpcode();
+    }
+
+    /// Constructor for reduction operations with opcode and its left and
+    /// right operands.
+    OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
+                  bool NoNaN = false)
+        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
+      assert(Kind != RK_None && "One of the reduction operations is expected.");
+    }
+
+    explicit operator bool() const { return Opcode; }
+
+    /// Get the index of the first operand.
+    unsigned getFirstOperandIndex() const {
+      assert(!!*this && "The opcode is not set.");
+      switch (Kind) {
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax:
+        return 1;
+      case RK_Arithmetic:
+      case RK_None:
+        break;
+      }
+      return 0;
+    }
+
+    /// Total number of operands in the reduction operation.
+    unsigned getNumberOfOperands() const {
+      assert(Kind != RK_None && !!*this && LHS && RHS &&
+             "Expected reduction operation.");
+      switch (Kind) {
+      case RK_Arithmetic:
+        return 2;
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax:
+        return 3;
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Reduction kind is not set");
+    }
+
+    /// Checks if the operation has the same parent as \p P.
+    bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
+      assert(Kind != RK_None && !!*this && LHS && RHS &&
+             "Expected reduction operation.");
+      if (!IsRedOp)
+        return I->getParent() == P;
+      switch (Kind) {
+      case RK_Arithmetic:
+        // Arithmetic reduction operation must be used once only.
+        return I->getParent() == P;
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax: {
+        // SelectInst must be used twice while the condition op must have a
+        // single use only.
+        auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+        return I->getParent() == P && Cmp && Cmp->getParent() == P;
+      }
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Reduction kind is not set");
+    }
+    /// Expected number of uses for reduction operations/reduced values.
+    bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
+      assert(Kind != RK_None && !!*this && LHS && RHS &&
+             "Expected reduction operation.");
+      switch (Kind) {
+      case RK_Arithmetic:
+        return I->hasOneUse();
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax:
+        return I->hasNUses(2) &&
+               (!IsReductionOp ||
+                cast<SelectInst>(I)->getCondition()->hasOneUse());
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Reduction kind is not set");
+    }
+
+    /// Initializes the list of reduction operations.
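    // Illustrative example (not part of the patch): for a min/max reduction
    // step such as
    //   %c = icmp slt i32 %x, %y
    //   %m = select i1 %c, i32 %x, i32 %y
    // the two lists set up below keep the compares (ReductionOps[0]) and the
    // selects (ReductionOps[1]) apart, so IR flags can later be propagated to
    // each group on its own (see createOp further down).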
+    void initReductionOps(ReductionOpsListType &ReductionOps) {
+      assert(Kind != RK_None && !!*this && LHS && RHS &&
+             "Expected reduction operation.");
+      switch (Kind) {
+      case RK_Arithmetic:
+        ReductionOps.assign(1, ReductionOpsType());
+        break;
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax:
+        ReductionOps.assign(2, ReductionOpsType());
+        break;
+      case RK_None:
+        llvm_unreachable("Reduction kind is not set");
+      }
+    }
+    /// Add all reduction operations for the reduction instruction \p I.
+    void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
+      assert(Kind != RK_None && !!*this && LHS && RHS &&
+             "Expected reduction operation.");
+      switch (Kind) {
+      case RK_Arithmetic:
+        ReductionOps[0].emplace_back(I);
+        break;
+      case RK_Min:
+      case RK_UMin:
+      case RK_Max:
+      case RK_UMax:
+        ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+        ReductionOps[1].emplace_back(I);
+        break;
+      case RK_None:
+        llvm_unreachable("Reduction kind is not set");
+      }
+    }
+
+    /// Checks if instruction is associative and can be vectorized.
+    bool isAssociative(Instruction *I) const {
+      assert(Kind != RK_None && *this && LHS && RHS &&
+             "Expected reduction operation.");
+      switch (Kind) {
+      case RK_Arithmetic:
+        return I->isAssociative();
+      case RK_Min:
+      case RK_Max:
+        return Opcode == Instruction::ICmp ||
+               cast<Instruction>(I->getOperand(0))->isFast();
+      case RK_UMin:
+      case RK_UMax:
+        assert(Opcode == Instruction::ICmp &&
+               "Only integer compare operation is expected.");
+        return true;
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Reduction kind is not set");
+    }
+
+    /// Checks if the reduction operation can be vectorized.
+    bool isVectorizable(Instruction *I) const {
+      return isVectorizable() && isAssociative(I);
+    }
+
+    /// Checks if two operation data are both a reduction op or both a reduced
+    /// value.
+    bool operator==(const OperationData &OD) {
+      assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
+             "One of the comparing operations is incorrect.");
+      return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
+    }
+    bool operator!=(const OperationData &OD) { return !(*this == OD); }
+    void clear() {
+      Opcode = 0;
+      LHS = nullptr;
+      RHS = nullptr;
+      Kind = RK_None;
+      NoNaN = false;
+    }
+
+    /// Get the opcode of the reduction operation.
+    unsigned getOpcode() const {
+      assert(isVectorizable() && "Expected vectorizable operation.");
+      return Opcode;
+    }
+
+    /// Get kind of reduction data.
+    ReductionKind getKind() const { return Kind; }
+    Value *getLHS() const { return LHS; }
+    Value *getRHS() const { return RHS; }
+    Type *getConditionType() const {
+      switch (Kind) {
+      case RK_Arithmetic:
+        return nullptr;
+      case RK_Min:
+      case RK_Max:
+      case RK_UMin:
+      case RK_UMax:
+        return CmpInst::makeCmpResultType(LHS->getType());
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Reduction kind is not set");
+    }
+
+    /// Creates reduction operation with the current opcode with the IR flags
+    /// from \p ReductionOps.
+    Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+                    const ReductionOpsListType &ReductionOps) const {
+      assert(isVectorizable() &&
+             "Expected add|fadd or min/max reduction operation.");
+      auto *Op = createOp(Builder, Name);
+      switch (Kind) {
+      case RK_Arithmetic:
+        propagateIRFlags(Op, ReductionOps[0]);
+        return Op;
+      case RK_Min:
+      case RK_Max:
+      case RK_UMin:
+      case RK_UMax:
+        if (auto *SI = dyn_cast<SelectInst>(Op))
+          propagateIRFlags(SI->getCondition(), ReductionOps[0]);
+        propagateIRFlags(Op, ReductionOps[1]);
+        return Op;
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Unknown reduction operation.");
+    }
+    /// Creates reduction operation with the current opcode with the IR flags
+    /// from \p I.
+    Value *createOp(IRBuilder<> &Builder, const Twine &Name,
+                    Instruction *I) const {
+      assert(isVectorizable() &&
+             "Expected add|fadd or min/max reduction operation.");
+      auto *Op = createOp(Builder, Name);
+      switch (Kind) {
+      case RK_Arithmetic:
+        propagateIRFlags(Op, I);
+        return Op;
+      case RK_Min:
+      case RK_Max:
+      case RK_UMin:
+      case RK_UMax:
+        if (auto *SI = dyn_cast<SelectInst>(Op)) {
+          propagateIRFlags(SI->getCondition(),
+                           cast<SelectInst>(I)->getCondition());
+        }
+        propagateIRFlags(Op, I);
+        return Op;
+      case RK_None:
+        break;
+      }
+      llvm_unreachable("Unknown reduction operation.");
+    }
+
+    TargetTransformInfo::ReductionFlags getFlags() const {
+      TargetTransformInfo::ReductionFlags Flags;
+      Flags.NoNaN = NoNaN;
+      switch (Kind) {
+      case RK_Arithmetic:
+        break;
+      case RK_Min:
+        Flags.IsSigned = Opcode == Instruction::ICmp;
+        Flags.IsMaxOp = false;
+        break;
+      case RK_Max:
+        Flags.IsSigned = Opcode == Instruction::ICmp;
+        Flags.IsMaxOp = true;
+        break;
+      case RK_UMin:
+        Flags.IsSigned = false;
+        Flags.IsMaxOp = false;
+        break;
+      case RK_UMax:
+        Flags.IsSigned = false;
+        Flags.IsMaxOp = true;
+        break;
+      case RK_None:
+        llvm_unreachable("Reduction kind is not set");
+      }
+      return Flags;
+    }
+  };
+
+  Instruction *ReductionRoot = nullptr;
+
+  /// The operation data of the reduction operation.
+  OperationData ReductionData;
+
+  /// The operation data of the values we perform a reduction on.
+  OperationData ReducedValueData;
 
-  /// The opcode of the reduction.
-  Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
-  /// The opcode of the values we perform a reduction on.
-  unsigned ReducedValueOpcode = 0;
   /// Should we model this reduction as a pairwise reduction tree or a tree that
   /// splits the vector in halves and adds those halves.
   bool IsPairwiseReduction = false;
@@ -4349,55 +5209,89 @@ class HorizontalReduction {
     }
   }
 
+  static OperationData getOperationData(Value *V) {
+    if (!V)
+      return OperationData();
+
+    Value *LHS;
+    Value *RHS;
+    if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
+      return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
+                           RK_Arithmetic);
+    }
+    if (auto *Select = dyn_cast<SelectInst>(V)) {
+      // Look for a min/max pattern.
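      // Illustrative example (not part of the patch): the canonical shape
      // matched below for an unsigned minimum (RK_UMin) is
      //   %cmp = icmp ult i32 %a, %b
      //   %min = select i1 %cmp, i32 %a, i32 %b
      // The signed and floating-point variants differ only in the compare
      // predicate.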
+      if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+      } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+      } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
+                 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(
+            Instruction::FCmp, LHS, RHS, RK_Min,
+            cast<Instruction>(Select->getCondition())->hasNoNaNs());
+      } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+      } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+      } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
+                 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
+        return OperationData(
+            Instruction::FCmp, LHS, RHS, RK_Max,
+            cast<Instruction>(Select->getCondition())->hasNoNaNs());
+      }
+    }
+    return OperationData(V);
+  }
+
 public:
   HorizontalReduction() = default;
 
   /// \brief Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+  bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
     assert((!Phi || is_contained(Phi->operands(), B)) &&
            "This phi needs to use the binary operator");
 
+    ReductionData = getOperationData(B);
+
     // We could have an initial reduction that is not an add.
     //  r *= v1 + v2 + v3 + v4
     // In such a case start looking for a tree rooted in the first '+'.
     if (Phi) {
-      if (B->getOperand(0) == Phi) {
+      if (ReductionData.getLHS() == Phi) {
         Phi = nullptr;
-        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+        B = dyn_cast<Instruction>(ReductionData.getRHS());
+        ReductionData = getOperationData(B);
-      } else if (B->getOperand(1) == Phi) {
+      } else if (ReductionData.getRHS() == Phi) {
         Phi = nullptr;
-        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+        B = dyn_cast<Instruction>(ReductionData.getLHS());
+        ReductionData = getOperationData(B);
       }
     }
 
-    if (!B)
+    if (!ReductionData.isVectorizable(B))
       return false;
 
     Type *Ty = B->getType();
     if (!isValidElementType(Ty))
       return false;
 
-    ReductionOpcode = B->getOpcode();
-    ReducedValueOpcode = 0;
+    ReducedValueData.clear();
     ReductionRoot = B;
 
-    // We currently only support adds.
-    if ((ReductionOpcode != Instruction::Add &&
-         ReductionOpcode != Instruction::FAdd) ||
-        !B->isAssociative())
-      return false;
-
     // Post order traverse the reduction tree starting at B. We only handle
     // true trees containing only binary operators.
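    // Illustrative example (not part of the patch): the explicit
    // (instruction, next-operand-index) stack below is a worklist form of
    // postorder traversal. For the tree
    //   %t0 = fadd float %v1, %v2
    //   %t1 = fadd float %t0, %v3
    // rooted at %t1, the walk descends %t1 -> %t0, collects %v1, %v2 and %v3
    // as reduced values, and records %t0 and %t1 as reduction operations on
    // the way back up.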
     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
-    Stack.push_back(std::make_pair(B, 0));
+    Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
+    ReductionData.initReductionOps(ReductionOps);
     while (!Stack.empty()) {
       Instruction *TreeN = Stack.back().first;
       unsigned EdgeToVisit = Stack.back().second++;
-      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+      OperationData OpData = getOperationData(TreeN);
+      bool IsReducedValue = OpData != ReductionData;
 
       // Postorder visit.
-      if (EdgeToVisit == 2 || IsReducedValue) {
+      if (IsReducedValue || EdgeToVisit == OpData.getNumberOfOperands()) {
         if (IsReducedValue)
           ReducedVals.push_back(TreeN);
         else {
@@ -4415,7 +5309,7 @@ public:
             markExtraArg(Stack[Stack.size() - 2], TreeN);
             ExtraArgs.erase(TreeN);
           } else
-            ReductionOps.push_back(TreeN);
+            ReductionData.addReductionOps(TreeN, ReductionOps);
         }
         // Retract.
         Stack.pop_back();
@@ -4426,45 +5320,50 @@ public:
       Value *NextV = TreeN->getOperand(EdgeToVisit);
       if (NextV != Phi) {
         auto *I = dyn_cast<Instruction>(NextV);
+        OpData = getOperationData(I);
         // Continue analysis if the next operand is a reduction operation or
         // (possibly) a reduced value. If the reduced value opcode is not set,
         // the first met operation != reduction operation is considered as the
         // reduced value class.
-        if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
-                  I->getOpcode() == ReductionOpcode)) {
+        if (I && (!ReducedValueData || OpData == ReducedValueData ||
+                  OpData == ReductionData)) {
+          const bool IsReductionOperation = OpData == ReductionData;
           // Only handle trees in the current basic block.
-          if (I->getParent() != B->getParent()) {
+          if (!ReductionData.hasSameParent(I, B->getParent(),
+                                           IsReductionOperation)) {
            // I is an extra argument for TreeN (its parent operation).
            markExtraArg(Stack.back(), I);
            continue;
          }
 
-          // Each tree node needs to have one user except for the ultimate
-          // reduction.
-          if (!I->hasOneUse() && I != B) {
+          // Each tree node needs to have a minimal number of users except for
+          // the ultimate reduction.
+          if (!ReductionData.hasRequiredNumberOfUses(I,
+                                                     OpData == ReductionData) &&
+              I != B) {
            // I is an extra argument for TreeN (its parent operation).
            markExtraArg(Stack.back(), I);
            continue;
          }
 
-          if (I->getOpcode() == ReductionOpcode) {
+          if (IsReductionOperation) {
             // We need to be able to reassociate the reduction operations.
-            if (!I->isAssociative()) {
+            if (!OpData.isAssociative(I)) {
               // I is an extra argument for TreeN (its parent operation).
               markExtraArg(Stack.back(), I);
               continue;
             }
-          } else if (ReducedValueOpcode &&
-                     ReducedValueOpcode != I->getOpcode()) {
+          } else if (ReducedValueData &&
+                     ReducedValueData != OpData) {
             // Make sure that the opcodes of the operations that we are going to
             // reduce match.
             // I is an extra argument for TreeN (its parent operation).
             markExtraArg(Stack.back(), I);
             continue;
-          } else if (!ReducedValueOpcode)
-            ReducedValueOpcode = I->getOpcode();
+          } else if (!ReducedValueData)
+            ReducedValueData = OpData;
 
-          Stack.push_back(std::make_pair(I, 0));
+          Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
           continue;
         }
       }
@@ -4492,7 +5391,7 @@ public:
     Value *VectorizedTree = nullptr;
     IRBuilder<> Builder(ReductionRoot);
     FastMathFlags Unsafe;
-    Unsafe.setUnsafeAlgebra();
+    Unsafe.setFast();
     Builder.setFastMathFlags(Unsafe);
     unsigned i = 0;
@@ -4501,12 +5400,15 @@ public:
     // to use it.
     for (auto &Pair : ExtraArgs)
       ExternallyUsedValues[Pair.second].push_back(Pair.first);
+    SmallVector<Value *, 16> IgnoreList;
+    for (auto &V : ReductionOps)
+      IgnoreList.append(V.begin(), V.end());
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ExternallyUsedValues, ReductionOps);
+      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
       if (V.shouldReorder()) {
         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
-        V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
+        V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
@@ -4516,17 +5418,27 @@ public:
       // Estimate cost.
       int Cost =
           V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
-      if (Cost >= -SLPCostThreshold)
-        break;
+      if (Cost >= -SLPCostThreshold) {
+          V.getORE()->emit([&]() {
+              return OptimizationRemarkMissed(
+                         SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
+                     << "Vectorizing horizontal reduction is possible "
+                     << "but not beneficial with cost "
+                     << ore::NV("Cost", Cost) << " and threshold "
+                     << ore::NV("Threshold", -SLPCostThreshold);
+          });
+          break;
+      }
 
       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                    << ". (HorRdx)\n");
-      auto *I0 = cast<Instruction>(VL[0]);
-      V.getORE()->emit(
-          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+      V.getORE()->emit([&]() {
+          return OptimizationRemark(
                     SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
           << "Vectorized horizontal reduction with cost "
          << ore::NV("Cost", Cost) << " and with tree size "
-          << ore::NV("TreeSize", V.getTreeSize()));
+          << ore::NV("TreeSize", V.getTreeSize());
+      });
 
       // Vectorize a tree.
       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4534,12 +5446,14 @@ public:
 
       // Emit a reduction.
       Value *ReducedSubTree =
-          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
+          emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
-        VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
-                                             ReducedSubTree, "bin.rdx");
-        propagateIRFlags(VectorizedTree, ReductionOps);
+        OperationData VectReductionData(ReductionData.getOpcode(),
+                                        VectorizedTree, ReducedSubTree,
+                                        ReductionData.getKind());
+        VectorizedTree =
+            VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
       } else
         VectorizedTree = ReducedSubTree;
       i += ReduxWidth;
@@ -4551,9 +5465,10 @@ public:
       for (; i < NumReducedVals; ++i) {
         auto *I = cast<Instruction>(ReducedVals[i]);
         Builder.SetCurrentDebugLocation(I->getDebugLoc());
-        VectorizedTree =
-            Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
-        propagateIRFlags(VectorizedTree, ReductionOps);
+        OperationData VectReductionData(ReductionData.getOpcode(),
+                                        VectorizedTree, I,
+                                        ReductionData.getKind());
+        VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
       }
       for (auto &Pair : ExternallyUsedValues) {
         assert(!Pair.second.empty() &&
@@ -4561,9 +5476,10 @@ public:
         // Add each externally used value to the final reduction.
         for (auto *I : Pair.second) {
           Builder.SetCurrentDebugLocation(I->getDebugLoc());
-          VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
-                                               Pair.first, "bin.extra");
-          propagateIRFlags(VectorizedTree, I);
+          OperationData VectReductionData(ReductionData.getOpcode(),
+                                          VectorizedTree, Pair.first,
+                                          ReductionData.getKind());
+          VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
         }
       }
       // Update users.
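A worked example for orientation; the unit costs below are assumed, not taken
from any real target. For an 8-wide signed-max reduction, the hunk that follows
prices the scalar chain as one compare plus one select per step:

    ScalarReduxCost = (ReduxWidth - 1) * (Cost(icmp) + Cost(select))
                    = (8 - 1) * (1 + 1) = 14

while the vector side pays a single getMinMaxReductionCost query per form, and
the cheaper of the pairwise and splitting forms is chosen.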
@@ -4583,15 +5499,58 @@ private:
     Type *ScalarTy = FirstReducedVal->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
 
-    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
-    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+    int PairwiseRdxCost;
+    int SplittingRdxCost;
+    switch (ReductionData.getKind()) {
+    case RK_Arithmetic:
+      PairwiseRdxCost =
+          TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                          /*IsPairwiseForm=*/true);
+      SplittingRdxCost =
+          TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                          /*IsPairwiseForm=*/false);
+      break;
+    case RK_Min:
+    case RK_Max:
+    case RK_UMin:
+    case RK_UMax: {
+      Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+      bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
+                        ReductionData.getKind() == RK_UMax;
+      PairwiseRdxCost =
+          TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+                                      /*IsPairwiseForm=*/true, IsUnsigned);
+      SplittingRdxCost =
+          TTI->getMinMaxReductionCost(VecTy, VecCondTy,
+                                      /*IsPairwiseForm=*/false, IsUnsigned);
+      break;
+    }
+    case RK_None:
+      llvm_unreachable("Expected arithmetic or min/max reduction operation");
+    }
 
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
 
-    int ScalarReduxCost =
-        (ReduxWidth - 1) *
-        TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
+    int ScalarReduxCost;
+    switch (ReductionData.getKind()) {
+    case RK_Arithmetic:
+      ScalarReduxCost =
+          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+      break;
+    case RK_Min:
+    case RK_Max:
+    case RK_UMin:
+    case RK_UMax:
+      ScalarReduxCost =
+          TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                  CmpInst::makeCmpResultType(ScalarTy));
+      break;
+    case RK_None:
+      llvm_unreachable("Expected arithmetic or min/max reduction operation");
+    }
+    ScalarReduxCost *= (ReduxWidth - 1);
 
     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                  << " for reduction that starts with " << *FirstReducedVal
@@ -4604,16 +5563,15 @@ private:
 
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
-                       unsigned ReduxWidth, ArrayRef<Value *> RedOps,
-                       const TargetTransformInfo *TTI) {
+                       unsigned ReduxWidth, const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
 
     if (!IsPairwiseReduction)
       return createSimpleTargetReduction(
-          Builder, TTI, ReductionOpcode, VectorizedValue,
-          TargetTransformInfo::ReductionFlags(), RedOps);
+          Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
+          ReductionData.getFlags(), ReductionOps.back());
 
     Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
@@ -4627,15 +5585,16 @@ private:
       Value *RightShuf = Builder.CreateShuffleVector(
           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
           "rdx.shuf.r");
-      TmpVec =
-          Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
-      propagateIRFlags(TmpVec, RedOps);
+      OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
+                                      RightShuf, ReductionData.getKind());
+      TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
     }
 
     // The result is in the first element of the vector.
     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
   }
 };
+
 } // end anonymous namespace
 
 /// \brief Recognize construction of vectors like
@@ -4643,39 +5602,29 @@ private:
 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///  starting from the last insertelement instruction.
 ///
 /// Returns true if it matches
-///
-static bool findBuildVector(InsertElementInst *FirstInsertElem,
+static bool findBuildVector(InsertElementInst *LastInsertElem,
                             SmallVectorImpl<Value *> &BuildVector,
                             SmallVectorImpl<Value *> &BuildVectorOpds) {
-  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
-    return false;
-
-  InsertElementInst *IE = FirstInsertElem;
-  while (true) {
-    BuildVector.push_back(IE);
-    BuildVectorOpds.push_back(IE->getOperand(1));
-
-    if (IE->use_empty())
-      return false;
-
-    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
-    if (!NextUse)
-      return true;
-
-    // If this isn't the final use, make sure the next insertelement is the only
-    // use. It's OK if the final constructed vector is used multiple times
-    if (!IE->hasOneUse())
+  Value *V = nullptr;
+  do {
+    BuildVector.push_back(LastInsertElem);
+    BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
+    V = LastInsertElem->getOperand(0);
+    if (isa<UndefValue>(V))
+      break;
+    LastInsertElem = dyn_cast<InsertElementInst>(V);
+    if (!LastInsertElem || !LastInsertElem->hasOneUse())
       return false;
-
-    IE = NextUse;
-  }
-
-  return false;
+  } while (true);
+  std::reverse(BuildVector.begin(), BuildVector.end());
+  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+  return true;
 }
 
-/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
+/// \brief Like findBuildVector, but looks for construction of aggregate.
 ///
 /// \return true if it matches.
 static bool findBuildAggregate(InsertValueInst *IV,
@@ -4767,14 +5716,14 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
 static bool tryToVectorizeHorReductionOrInstOperands(
     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
     TargetTransformInfo *TTI,
-    const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+    const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
 
   if (!Root)
     return false;
 
-  if (Root->getParent() != BB)
+  if (Root->getParent() != BB || isa<PHINode>(Root))
     return false;
   // Start analysis starting from Root instruction. If horizontal reduction is
   // found, try to vectorize it. If it is not a horizontal reduction or
@@ -4795,11 +5744,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     if (!V)
      continue;
     auto *Inst = dyn_cast<Instruction>(V);
-    if (!Inst || isa<PHINode>(Inst))
+    if (!Inst)
      continue;
-    if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+    auto *BI = dyn_cast<BinaryOperator>(Inst);
+    auto *SI = dyn_cast<SelectInst>(Inst);
+    if (BI || SI) {
       HorizontalReduction HorRdx;
-      if (HorRdx.matchAssociativeReduction(P, BI)) {
+      if (HorRdx.matchAssociativeReduction(P, Inst)) {
        if (HorRdx.tryToReduce(R, TTI)) {
          Res = true;
          // Set P to nullptr to avoid re-analysis of phi node in
@@ -4808,7 +5759,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
          continue;
        }
      }
-      if (P) {
+      if (P && BI) {
        Inst = dyn_cast<Instruction>(BI->getOperand(0));
        if (Inst == P)
          Inst = dyn_cast<Instruction>(BI->getOperand(1));
@@ -4823,15 +5774,20 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     // Set P to nullptr to avoid re-analysis of phi node in
     // matchAssociativeReduction function unless this is the root node.
     P = nullptr;
-    if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+    if (Vectorize(Inst, R)) {
      Res = true;
      continue;
    }
 
     // Try to vectorize operands.
+    // To save compile time, only continue the analysis for instructions from
+    // the same basic block.
     if (++Level < RecursionMaxDepth)
       for (auto *Op : Inst->operand_values())
-        Stack.emplace_back(Op, Level);
+        if (VisitedInstrs.insert(Op).second)
+          if (auto *I = dyn_cast<Instruction>(Op))
+            if (!isa<PHINode>(I) && I->getParent() == BB)
+              Stack.emplace_back(Op, Level);
  }
  return Res;
 }
@@ -4848,10 +5804,71 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
   if (!isa<BinaryOperator>(I))
     P = nullptr;
   // Try to match and vectorize a horizontal reduction.
-  return tryToVectorizeHorReductionOrInstOperands(
-      P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
-        return tryToVectorize(BI, R);
-      });
+  auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+    return tryToVectorize(I, R);
+  };
+  return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+                                                  ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+                                                 BasicBlock *BB, BoUpSLP &R) {
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  if (!R.canMapToVector(IVI->getType(), DL))
+    return false;
+
+  SmallVector<Value *, 16> BuildVector;
+  SmallVector<Value *, 16> BuildVectorOpds;
+  if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+    return false;
+
+  DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  // An aggregate value is unlikely to be processed in a vector register; we
+  // need to extract scalars into scalar registers, so NeedExtraction is set
+  // to true.
+  return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+                                                   BasicBlock *BB, BoUpSLP &R) {
+  SmallVector<Value *, 16> BuildVector;
+  SmallVector<Value *, 16> BuildVectorOpds;
+  if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+    return false;
+
+  // Vectorize starting with the build vector operands ignoring the BuildVector
+  // instructions for the purpose of scheduling and user extraction.
+  return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+                                         BoUpSLP &R) {
+  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+    return true;
+
+  bool OpsChanged = false;
+  for (int Idx = 0; Idx < 2; ++Idx) {
+    OpsChanged |=
+        vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+  }
+  return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+    SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+  bool OpsChanged = false;
+  for (auto &VH : reverse(Instructions)) {
+    auto *I = dyn_cast_or_null<Instruction>(VH);
+    if (!I)
+      continue;
+    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+    else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+    else if (auto *CI = dyn_cast<CmpInst>(I))
+      OpsChanged |= vectorizeCmpInst(CI, BB, R);
+  }
+  Instructions.clear();
+  return OpsChanged;
 }
 
 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4913,10 +5930,21 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 
   VisitedInstrs.clear();
 
+  SmallVector<WeakVH, 8> PostProcessInstructions;
+  SmallDenseSet<Instruction *, 4> KeyNodes;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
-    if (!VisitedInstrs.insert(&*it).second)
+    if (!VisitedInstrs.insert(&*it).second) {
+      if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+          vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid value.
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+      }
       continue;
+    }
 
     if (isa<DbgInfoIntrinsic>(it))
       continue;
@@ -4938,96 +5966,37 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       continue;
     }
 
-    if (ShouldStartVectorizeHorAtStore) {
-      if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
-        // Try to match and vectorize a horizontal reduction.
-        if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
-                                     TTI)) {
-          Changed = true;
-          it = BB->begin();
-          e = BB->end();
-          continue;
+    // We ran into an instruction without users, such as a terminator, a store,
+    // or a function call whose return value is ignored. Ignore unused
+    // instructions (based on the instruction type, except for CallInst and
+    // InvokeInst).
+    if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+                            isa<InvokeInst>(it))) {
+      KeyNodes.insert(&*it);
+      bool OpsChanged = false;
+      if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+        for (auto *V : it->operand_values()) {
+          // Try to match and vectorize a horizontal reduction.
+          OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
         }
       }
-    }
-
-    // Try to vectorize horizontal reductions feeding into a return.
-    if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
-      if (RI->getNumOperands() != 0) {
-        // Try to match and vectorize a horizontal reduction.
-        if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
-          Changed = true;
-          it = BB->begin();
-          e = BB->end();
-          continue;
-        }
-      }
-    }
-
-    // Try to vectorize trees that start at compare instructions.
-    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
-      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
-        Changed = true;
+      // Start vectorization of post-process list of instructions from the
+      // top-tree instructions to try to vectorize as many instructions as
+      // possible.
+      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+      if (OpsChanged) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
-        it = BB->begin();
-        e = BB->end();
-        continue;
-      }
-
-      for (int I = 0; I < 2; ++I) {
-        if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
-          Changed = true;
-          // We would like to start over since some instructions are deleted
-          // and the iterator may become invalid value.
-          it = BB->begin();
-          e = BB->end();
-          break;
-        }
-      }
-      continue;
-    }
-
-    // Try to vectorize trees that start at insertelement instructions.
-    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
-      SmallVector<Value *, 16> BuildVector;
-      SmallVector<Value *, 16> BuildVectorOpds;
-      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
-        continue;
-
-      // Vectorize starting with the build vector operands ignoring the
-      // BuildVector instructions for the purpose of scheduling and user
-      // extraction.
-      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
+        continue;
       }
-
-      continue;
     }
 
-    // Try to vectorize trees that start at insertvalue instructions feeding into
-    // a store.
-    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
-      if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
-        const DataLayout &DL = BB->getModule()->getDataLayout();
-        if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
-          SmallVector<Value *, 16> BuildVector;
-          SmallVector<Value *, 16> BuildVectorOpds;
-          if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
-            continue;
+    if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+        isa<InsertValueInst>(it))
+      PostProcessInstructions.push_back(&*it);
 
-          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
-          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
-            Changed = true;
-            it = BB->begin();
-            e = BB->end();
-          }
-          continue;
-        }
-      }
-    }
   }
 
   return Changed;
@@ -5036,7 +6005,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
   auto Changed = false;
   for (auto &Entry : GEPs) {
-
     // If the getelementptr list has fewer than two elements, there's nothing
     // to do.
     if (Entry.second.size() < 2)
@@ -5141,7 +6109,9 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
 }
 
 char SLPVectorizer::ID = 0;
+
 static const char lv_name[] = "SLP Vectorizer";
+
 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
@@ -5152,6 +6122,4 @@ INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
-namespace llvm {
-Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
-}
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
new file mode 100644
index 000000000000..4e54fc6db2a5
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -0,0 +1,557 @@
+//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the LLVM vectorization plan. It represents a candidate for
+/// vectorization, making it possible to plan and optimize how to vectorize a
+/// given loop before generating LLVM-IR.
+/// The vectorizer uses vectorization plans to estimate the costs of potential
+/// candidates and, if profitable, to execute the desired plan, generating
+/// vector LLVM-IR code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
+  if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V))
+    Instr->print(OS);
+  else
+    V.printAsOperand(OS);
+  return OS;
+}
+
+/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
+  const VPBlockBase *Block = this;
+  while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    Block = Region->getEntry();
+  return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
+  VPBlockBase *Block = this;
+  while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    Block = Region->getEntry();
+  return cast<VPBasicBlock>(Block);
+}
+
+/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+  const VPBlockBase *Block = this;
+  while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    Block = Region->getExit();
+  return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+  VPBlockBase *Block = this;
+  while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    Block = Region->getExit();
+  return cast<VPBasicBlock>(Block);
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
+  if (!Successors.empty() || !Parent)
+    return this;
+  assert(Parent->getExit() == this &&
+         "Block w/o successors not the exit of its parent.");
+  return Parent->getEnclosingBlockWithSuccessors();
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
+  if (!Predecessors.empty() || !Parent)
+    return this;
+  assert(Parent->getEntry() == this &&
+         "Block w/o predecessors not the entry of its parent.");
+  return Parent->getEnclosingBlockWithPredecessors();
+}
+
+void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
+  SmallVector<VPBlockBase *, 8> Blocks;
+  for (VPBlockBase *Block : depth_first(Entry))
+    Blocks.push_back(Block);
+
+  for (VPBlockBase *Block : Blocks)
+    delete Block;
+}
+
+BasicBlock *
+VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+  // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
+  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
+  BasicBlock *PrevBB = CFG.PrevBB;
+  BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
+                                         PrevBB->getParent(), CFG.LastBB);
+  DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+
+  // Hook up the new basic block to its predecessors.
+  for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+    VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
+    auto &PredVPSuccessors = PredVPBB->getSuccessors();
+    BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+    assert(PredBB && "Predecessor basic-block not found building successor.");
+    auto *PredBBTerminator = PredBB->getTerminator();
+    DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
+    if (isa<UnreachableInst>(PredBBTerminator)) {
+      assert(PredVPSuccessors.size() == 1 &&
+             "Predecessor ending w/o branch must have single successor.");
+      PredBBTerminator->eraseFromParent();
+      BranchInst::Create(NewBB, PredBB);
+    } else {
+      assert(PredVPSuccessors.size() == 2 &&
+             "Predecessor ending with branch must have two successors.");
+      unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+      assert(!PredBBTerminator->getSuccessor(idx) &&
+             "Trying to reset an existing successor block.");
+      PredBBTerminator->setSuccessor(idx, NewBB);
+    }
+  }
+  return NewBB;
+}
+
+void VPBasicBlock::execute(VPTransformState *State) {
+  bool Replica = State->Instance &&
+                 !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+  VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
+  VPBlockBase *SingleHPred = nullptr;
+  BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+
+  // 1. Create an IR basic block, or reuse the last one if possible.
+  // The last IR basic block is reused, as an optimization, in three cases:
+  // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
+  // B. when the current VPBB has a single (hierarchical) predecessor which
+  //    is PrevVPBB and the latter has a single (hierarchical) successor; and
+  // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+  //    is the exit of this region from a previous instance, or the predecessor
+  //    of this region.
+  if (PrevVPBB && /* A */
+      !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+        SingleHPred->getExitBasicBlock() == PrevVPBB &&
+        PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
+      !(Replica && getPredecessors().empty())) {       /* C */
+    NewBB = createEmptyBasicBlock(State->CFG);
+    State->Builder.SetInsertPoint(NewBB);
+    // Temporarily terminate with unreachable until CFG is rewired.
+    UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+    State->Builder.SetInsertPoint(Terminator);
+    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
+    Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
+    L->addBasicBlockToLoop(NewBB, *State->LI);
+    State->CFG.PrevBB = NewBB;
+  }
+
+  // 2. Fill the IR basic block with IR instructions.
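  // [Editor's note, not part of the patch: because of cases A and B above,
  // several straight-line VPBasicBlocks may fill the same reused IR
  // BasicBlock; State->CFG.VPBB2IRBB, set below, records which IR block each
  // VPBB landed in, for the later hookup of CFG edges.]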
+  DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName() +               << " in BB:" << NewBB->getName() << '\n'); + +  State->CFG.VPBB2IRBB[this] = NewBB; +  State->CFG.PrevVPBB = this; + +  for (VPRecipeBase &Recipe : Recipes) +    Recipe.execute(*State); + +  DEBUG(dbgs() << "LV: filled BB:" << *NewBB); +} + +void VPRegionBlock::execute(VPTransformState *State) { +  ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); + +  if (!isReplicator()) { +    // Visit the VPBlocks connected to "this", starting from it. +    for (VPBlockBase *Block : RPOT) { +      DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); +      Block->execute(State); +    } +    return; +  } + +  assert(!State->Instance && "Replicating a Region with non-null instance."); + +  // Enter replicating mode. +  State->Instance = {0, 0}; + +  for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { +    State->Instance->Part = Part; +    for (unsigned Lane = 0, VF = State->VF; Lane < VF; ++Lane) { +      State->Instance->Lane = Lane; +      // Visit the VPBlocks connected to \p this, starting from it. +      for (VPBlockBase *Block : RPOT) { +        DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); +        Block->execute(State); +      } +    } +  } + +  // Exit replicating mode. +  State->Instance.reset(); +} + +void VPInstruction::generateInstruction(VPTransformState &State, +                                        unsigned Part) { +  IRBuilder<> &Builder = State.Builder; + +  if (Instruction::isBinaryOp(getOpcode())) { +    Value *A = State.get(getOperand(0), Part); +    Value *B = State.get(getOperand(1), Part); +    Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); +    State.set(this, V, Part); +    return; +  } + +  switch (getOpcode()) { +  case VPInstruction::Not: { +    Value *A = State.get(getOperand(0), Part); +    Value *V = Builder.CreateNot(A); +    State.set(this, V, Part); +    break; +  } +  default: +    llvm_unreachable("Unsupported opcode for instruction"); +  } +} + +void VPInstruction::execute(VPTransformState &State) { +  assert(!State.Instance && "VPInstruction executing an Instance"); +  for (unsigned Part = 0; Part < State.UF; ++Part) +    generateInstruction(State, Part); +} + +void VPInstruction::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" << Indent << "\"EMIT "; +  print(O); +  O << "\\l\""; +} + +void VPInstruction::print(raw_ostream &O) const { +  printAsOperand(O); +  O << " = "; + +  switch (getOpcode()) { +  case VPInstruction::Not: +    O << "not"; +    break; +  default: +    O << Instruction::getOpcodeName(getOpcode()); +  } + +  for (const VPValue *Operand : operands()) { +    O << " "; +    Operand->printAsOperand(O); +  } +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { +  // 0. Set the reverse mapping from VPValues to Values for code generation. +  for (auto &Entry : Value2VPValue) +    State->VPValue2Value[Entry.second] = Entry.first; + +  BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; +  BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); +  assert(VectorHeaderBB && "Loop preheader does not have a single successor."); +  BasicBlock *VectorLatchBB = VectorHeaderBB; + +  // 1. Make room to generate basic-blocks inside loop body if needed. 
+  VectorLatchBB = VectorHeaderBB->splitBasicBlock( +      VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); +  Loop *L = State->LI->getLoopFor(VectorHeaderBB); +  L->addBasicBlockToLoop(VectorLatchBB, *State->LI); +  // Remove the edge between Header and Latch to allow other connections. +  // Temporarily terminate with unreachable until CFG is rewired. +  // Note: this asserts the generated code's assumption that +  // getFirstInsertionPt() can be dereferenced into an Instruction. +  VectorHeaderBB->getTerminator()->eraseFromParent(); +  State->Builder.SetInsertPoint(VectorHeaderBB); +  UnreachableInst *Terminator = State->Builder.CreateUnreachable(); +  State->Builder.SetInsertPoint(Terminator); + +  // 2. Generate code in loop body. +  State->CFG.PrevVPBB = nullptr; +  State->CFG.PrevBB = VectorHeaderBB; +  State->CFG.LastBB = VectorLatchBB; + +  for (VPBlockBase *Block : depth_first(Entry)) +    Block->execute(State); + +  // 3. Merge the temporary latch created with the last basic-block filled. +  BasicBlock *LastBB = State->CFG.PrevBB; +  // Connect LastBB to VectorLatchBB to facilitate their merge. +  assert(isa<UnreachableInst>(LastBB->getTerminator()) && +         "Expected VPlan CFG to terminate with unreachable"); +  LastBB->getTerminator()->eraseFromParent(); +  BranchInst::Create(VectorLatchBB, LastBB); + +  // Merge LastBB with Latch. +  bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); +  (void)Merged; +  assert(Merged && "Could not merge last basic block with latch."); +  VectorLatchBB = LastBB; + +  updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, +                                BasicBlock *LoopLatchBB) { +  BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); +  assert(LoopHeaderBB && "Loop preheader does not have a single successor."); +  DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); +  // The vector body may be more than a single basic-block by this point. +  // Update the dominator tree information inside the vector body by propagating +  // it from header to latch, expecting only triangular control-flow, if any. +  BasicBlock *PostDomSucc = nullptr; +  for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) { +    // Get the list of successors of this block. 
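    // [Editor's illustration, not part of the patch: per the asserts that
    // follow, each iteration handles exactly two shapes - either a straight
    // edge BB -> PostDomSucc, or a triangle BB -> InterimSucc -> PostDomSucc
    // together with a direct edge BB -> PostDomSucc.]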
+    std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+    assert(Succs.size() <= 2 &&
+           "Basic block in vector loop has more than 2 successors.");
+    PostDomSucc = Succs[0];
+    if (Succs.size() == 1) {
+      assert(PostDomSucc->getSinglePredecessor() &&
+             "PostDom successor has more than one predecessor.");
+      DT->addNewBlock(PostDomSucc, BB);
+      continue;
+    }
+    BasicBlock *InterimSucc = Succs[1];
+    if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
+      PostDomSucc = Succs[1];
+      InterimSucc = Succs[0];
+    }
+    assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
+           "One successor of a basic block does not lead to the other.");
+    assert(InterimSucc->getSinglePredecessor() &&
+           "Interim successor has more than one predecessor.");
+    assert(std::distance(pred_begin(PostDomSucc), pred_end(PostDomSucc)) == 2 &&
+           "PostDom successor has more than two predecessors.");
+    DT->addNewBlock(InterimSucc, BB);
+    DT->addNewBlock(PostDomSucc, BB);
+  }
+}
+
+const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+  return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
+         Twine(getOrCreateBID(Block));
+}
+
+const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+  const std::string &Name = Block->getName();
+  if (!Name.empty())
+    return Name;
+  return "VPB" + Twine(getOrCreateBID(Block));
+}
+
+void VPlanPrinter::dump() {
+  Depth = 1;
+  bumpIndent(0);
+  OS << "digraph VPlan {\n";
+  OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
+  if (!Plan.getName().empty())
+    OS << "\\n" << DOT::EscapeString(Plan.getName());
+  if (!Plan.Value2VPValue.empty()) {
+    OS << ", where:";
+    for (auto Entry : Plan.Value2VPValue) {
+      OS << "\\n" << *Entry.second;
+      OS << DOT::EscapeString(" := ");
+      Entry.first->printAsOperand(OS, false);
+    }
+  }
+  OS << "\"]\n";
+  OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
+  OS << "edge [fontname=Courier, fontsize=30]\n";
+  OS << "compound=true\n";
+
+  for (VPBlockBase *Block : depth_first(Plan.getEntry()))
+    dumpBlock(Block);
+
+  OS << "}\n";
+}
+
+void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
+  if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
+    dumpBasicBlock(BasicBlock);
+  else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+    dumpRegion(Region);
+  else
+    llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
+                            bool Hidden, const Twine &Label) {
+  // Due to "dot" we print an edge between two regions as an edge between the
+  // exit basic block and the entry basic block of the respective regions.
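  // [Editor's note, not part of the patch: for region endpoints this emits
  // edges such as N1 -> N4 [ label="" ltail=cluster_N0 lhead=cluster_N3 ];
  // with compound=true, dot clips such an edge at the cluster borders. The
  // node numbers here are hypothetical.]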
+  const VPBlockBase *Tail = From->getExitBasicBlock(); +  const VPBlockBase *Head = To->getEntryBasicBlock(); +  OS << Indent << getUID(Tail) << " -> " << getUID(Head); +  OS << " [ label=\"" << Label << '\"'; +  if (Tail != From) +    OS << " ltail=" << getUID(From); +  if (Head != To) +    OS << " lhead=" << getUID(To); +  if (Hidden) +    OS << "; splines=none"; +  OS << "]\n"; +} + +void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { +  auto &Successors = Block->getSuccessors(); +  if (Successors.size() == 1) +    drawEdge(Block, Successors.front(), false, ""); +  else if (Successors.size() == 2) { +    drawEdge(Block, Successors.front(), false, "T"); +    drawEdge(Block, Successors.back(), false, "F"); +  } else { +    unsigned SuccessorNumber = 0; +    for (auto *Successor : Successors) +      drawEdge(Block, Successor, false, Twine(SuccessorNumber++)); +  } +} + +void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { +  OS << Indent << getUID(BasicBlock) << " [label =\n"; +  bumpIndent(1); +  OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; +  bumpIndent(1); +  for (const VPRecipeBase &Recipe : *BasicBlock) +    Recipe.print(OS, Indent); +  bumpIndent(-2); +  OS << "\n" << Indent << "]\n"; +  dumpEdges(BasicBlock); +} + +void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { +  OS << Indent << "subgraph " << getUID(Region) << " {\n"; +  bumpIndent(1); +  OS << Indent << "fontname=Courier\n" +     << Indent << "label=\"" +     << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ") +     << DOT::EscapeString(Region->getName()) << "\"\n"; +  // Dump the blocks of the region. +  assert(Region->getEntry() && "Region contains no inner blocks."); +  for (const VPBlockBase *Block : depth_first(Region->getEntry())) +    dumpBlock(Block); +  bumpIndent(-1); +  OS << Indent << "}\n"; +  dumpEdges(Region); +} + +void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { +  std::string IngredientString; +  raw_string_ostream RSO(IngredientString); +  if (auto *Inst = dyn_cast<Instruction>(V)) { +    if (!Inst->getType()->isVoidTy()) { +      Inst->printAsOperand(RSO, false); +      RSO << " = "; +    } +    RSO << Inst->getOpcodeName() << " "; +    unsigned E = Inst->getNumOperands(); +    if (E > 0) { +      Inst->getOperand(0)->printAsOperand(RSO, false); +      for (unsigned I = 1; I < E; ++I) +        Inst->getOperand(I)->printAsOperand(RSO << ", ", false); +    } +  } else // !Inst +    V->printAsOperand(RSO, false); +  RSO.flush(); +  O << DOT::EscapeString(IngredientString); +} + +void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" << Indent << "\"WIDEN\\l\""; +  for (auto &Instr : make_range(Begin, End)) +    O << " +\n" << Indent << "\"  " << VPlanIngredient(&Instr) << "\\l\""; +} + +void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, +                                          const Twine &Indent) const { +  O << " +\n" << Indent << "\"WIDEN-INDUCTION"; +  if (Trunc) { +    O << "\\l\""; +    O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\""; +    O << " +\n" << Indent << "\"  " << VPlanIngredient(Trunc) << "\\l\""; +  } else +    O << " " << VPlanIngredient(IV) << "\\l\""; +} + +void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; +} + +void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" << Indent << "\"BLEND "; +  
Phi->printAsOperand(O, false); +  O << " ="; +  if (!User) { +    // Not a User of any mask: not really blending, this is a +    // single-predecessor phi. +    O << " "; +    Phi->getIncomingValue(0)->printAsOperand(O, false); +  } else { +    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) { +      O << " "; +      Phi->getIncomingValue(I)->printAsOperand(O, false); +      O << "/"; +      User->getOperand(I)->printAsOperand(O); +    } +  } +  O << "\\l\""; +} + +void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" +    << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") +    << VPlanIngredient(Ingredient); +  if (AlsoPack) +    O << " (S->V)"; +  O << "\\l\""; +} + +void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { +  O << " +\n" +    << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) +    << "\\l\""; +} + +void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, +                                           const Twine &Indent) const { +  O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); +  if (User) { +    O << ", "; +    User->getOperand(0)->printAsOperand(O); +  } +  O << "\\l\""; +} diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h new file mode 100644 index 000000000000..2ccabfd6af25 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlan.h @@ -0,0 +1,1194 @@ +//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the declarations of the Vectorization Plan base classes: +/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual +///    VPBlockBase, together implementing a Hierarchical CFG; +/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be +///    treated as proper graphs for generic algorithms; +/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained +///    within VPBasicBlocks; +/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned +///    instruction; +/// 5. The VPlan class holding a candidate for vectorization; +/// 6. The VPlanPrinter class providing a way to print a plan in dot format; +/// These are documented in docs/VectorizationPlan.rst. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H + +#include "VPlanValue.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/IR/IRBuilder.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <map> +#include <string> + +// The (re)use of existing LoopVectorize classes is subject to future VPlan +// refactoring. 
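
[Editor's sketch, not part of the patch: a minimal example of composing the
Hierarchical CFG with the classes declared below; the block names are
arbitrary. It builds an if-then triangle wrapped in a SESE region.]

    VPBasicBlock *Cond = new VPBasicBlock("cond");
    VPBasicBlock *Then = new VPBasicBlock("then");
    VPBasicBlock *Merge = new VPBasicBlock("merge");
    Cond->setTwoSuccessors(Then, Merge); // printed as "T"/"F" edges in dot.
    Then->setOneSuccessor(Merge);
    // The region takes the single entry and single exit; deleting the VPlan
    // frees all blocks reachable from its entry.
    VPlan Plan(new VPRegionBlock(Cond, Merge, "if.then"));
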
+namespace { +class LoopVectorizationLegality; +class LoopVectorizationCostModel; +} // namespace + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class InnerLoopVectorizer; +class InterleaveGroup; +class LoopInfo; +class raw_ostream; +class Value; +class VPBasicBlock; +class VPRegionBlock; + +/// In what follows, the term "input IR" refers to code that is fed into the +/// vectorizer whereas the term "output IR" refers to code that is generated by +/// the vectorizer. + +/// VPIteration represents a single point in the iteration space of the output +/// (vectorized and/or unrolled) IR loop. +struct VPIteration { +  /// in [0..UF) +  unsigned Part; + +  /// in [0..VF) +  unsigned Lane; +}; + +/// This is a helper struct for maintaining vectorization state. It's used for +/// mapping values from the original loop to their corresponding values in +/// the new loop. Two mappings are maintained: one for vectorized values and +/// one for scalarized values. Vectorized values are represented with UF +/// vector values in the new loop, and scalarized values are represented with +/// UF x VF scalar values in the new loop. UF and VF are the unroll and +/// vectorization factors, respectively. +/// +/// Entries can be added to either map with setVectorValue and setScalarValue, +/// which assert that an entry was not already added before. If an entry is to +/// replace an existing one, call resetVectorValue and resetScalarValue. This is +/// currently needed to modify the mapped values during "fix-up" operations that +/// occur once the first phase of widening is complete. These operations include +/// type truncation and the second phase of recurrence widening. +/// +/// Entries from either map can be retrieved using the getVectorValue and +/// getScalarValue functions, which assert that the desired value exists. +struct VectorizerValueMap { +  friend struct VPTransformState; + +private: +  /// The unroll factor. Each entry in the vector map contains UF vector values. +  unsigned UF; + +  /// The vectorization factor. Each entry in the scalar map contains UF x VF +  /// scalar values. +  unsigned VF; + +  /// The vector and scalar map storage. We use std::map and not DenseMap +  /// because insertions to DenseMap invalidate its iterators. +  using VectorParts = SmallVector<Value *, 2>; +  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; +  std::map<Value *, VectorParts> VectorMapStorage; +  std::map<Value *, ScalarParts> ScalarMapStorage; + +public: +  /// Construct an empty map with the given unroll and vectorization factors. +  VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + +  /// \return True if the map has any vector entry for \p Key. +  bool hasAnyVectorValue(Value *Key) const { +    return VectorMapStorage.count(Key); +  } + +  /// \return True if the map has a vector entry for \p Key and \p Part. +  bool hasVectorValue(Value *Key, unsigned Part) const { +    assert(Part < UF && "Queried Vector Part is too large."); +    if (!hasAnyVectorValue(Key)) +      return false; +    const VectorParts &Entry = VectorMapStorage.find(Key)->second; +    assert(Entry.size() == UF && "VectorParts has wrong dimensions."); +    return Entry[Part] != nullptr; +  } + +  /// \return True if the map has any scalar entry for \p Key. +  bool hasAnyScalarValue(Value *Key) const { +    return ScalarMapStorage.count(Key); +  } + +  /// \return True if the map has a scalar entry for \p Key and \p Instance. 
+  bool hasScalarValue(Value *Key, const VPIteration &Instance) const { +    assert(Instance.Part < UF && "Queried Scalar Part is too large."); +    assert(Instance.Lane < VF && "Queried Scalar Lane is too large."); +    if (!hasAnyScalarValue(Key)) +      return false; +    const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; +    assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); +    assert(Entry[Instance.Part].size() == VF && +           "ScalarParts has wrong dimensions."); +    return Entry[Instance.Part][Instance.Lane] != nullptr; +  } + +  /// Retrieve the existing vector value that corresponds to \p Key and +  /// \p Part. +  Value *getVectorValue(Value *Key, unsigned Part) { +    assert(hasVectorValue(Key, Part) && "Getting non-existent value."); +    return VectorMapStorage[Key][Part]; +  } + +  /// Retrieve the existing scalar value that corresponds to \p Key and +  /// \p Instance. +  Value *getScalarValue(Value *Key, const VPIteration &Instance) { +    assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); +    return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; +  } + +  /// Set a vector value associated with \p Key and \p Part. Assumes such a +  /// value is not already set. If it is, use resetVectorValue() instead. +  void setVectorValue(Value *Key, unsigned Part, Value *Vector) { +    assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); +    if (!VectorMapStorage.count(Key)) { +      VectorParts Entry(UF); +      VectorMapStorage[Key] = Entry; +    } +    VectorMapStorage[Key][Part] = Vector; +  } + +  /// Set a scalar value associated with \p Key and \p Instance. Assumes such a +  /// value is not already set. +  void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) { +    assert(!hasScalarValue(Key, Instance) && "Scalar value already set"); +    if (!ScalarMapStorage.count(Key)) { +      ScalarParts Entry(UF); +      // TODO: Consider storing uniform values only per-part, as they occupy +      //       lane 0 only, keeping the other VF-1 redundant entries null. +      for (unsigned Part = 0; Part < UF; ++Part) +        Entry[Part].resize(VF, nullptr); +      ScalarMapStorage[Key] = Entry; +    } +    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; +  } + +  /// Reset the vector value associated with \p Key for the given \p Part. +  /// This function can be used to update values that have already been +  /// vectorized. This is the case for "fix-up" operations including type +  /// truncation and the second phase of recurrence vectorization. +  void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { +    assert(hasVectorValue(Key, Part) && "Vector value not set for part"); +    VectorMapStorage[Key][Part] = Vector; +  } + +  /// Reset the scalar value associated with \p Key for \p Part and \p Lane. +  /// This function can be used to update values that have already been +  /// scalarized. This is the case for "fix-up" operations including scalar phi +  /// nodes for scalarized and predicated instructions. +  void resetScalarValue(Value *Key, const VPIteration &Instance, +                        Value *Scalar) { +    assert(hasScalarValue(Key, Instance) && +           "Scalar value not set for part and lane"); +    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; +  } +}; + +/// This class is used to enable the VPlan to invoke a method of ILV. This is +/// needed until the method is refactored out of ILV and becomes reusable. 
+struct VPCallback { +  virtual ~VPCallback() {} +  virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; +}; + +/// VPTransformState holds information passed down when "executing" a VPlan, +/// needed for generating the output IR. +struct VPTransformState { +  VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, +                   IRBuilder<> &Builder, VectorizerValueMap &ValueMap, +                   InnerLoopVectorizer *ILV, VPCallback &Callback) +      : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), +        ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + +  /// The chosen Vectorization and Unroll Factors of the loop being vectorized. +  unsigned VF; +  unsigned UF; + +  /// Hold the indices to generate specific scalar instructions. Null indicates +  /// that all instances are to be generated, using either scalar or vector +  /// instructions. +  Optional<VPIteration> Instance; + +  struct DataState { +    /// A type for vectorized values in the new loop. Each value from the +    /// original loop, when vectorized, is represented by UF vector values in +    /// the new unrolled loop, where UF is the unroll factor. +    typedef SmallVector<Value *, 2> PerPartValuesTy; + +    DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; +  } Data; + +  /// Get the generated Value for a given VPValue and a given Part. Note that +  /// as some Defs are still created by ILV and managed in its ValueMap, this +  /// method will delegate the call to ILV in such cases in order to provide +  /// callers a consistent API. +  /// \see set. +  Value *get(VPValue *Def, unsigned Part) { +    // If Values have been set for this Def return the one relevant for \p Part. +    if (Data.PerPartOutput.count(Def)) +      return Data.PerPartOutput[Def][Part]; +    // Def is managed by ILV: bring the Values from ValueMap. +    return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); +  } + +  /// Set the generated Value for a given VPValue and a given Part. +  void set(VPValue *Def, Value *V, unsigned Part) { +    if (!Data.PerPartOutput.count(Def)) { +      DataState::PerPartValuesTy Entry(UF); +      Data.PerPartOutput[Def] = Entry; +    } +    Data.PerPartOutput[Def][Part] = V; +  } + +  /// Hold state information used when constructing the CFG of the output IR, +  /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. +  struct CFGState { +    /// The previous VPBasicBlock visited. Initially set to null. +    VPBasicBlock *PrevVPBB = nullptr; + +    /// The previous IR BasicBlock created or used. Initially set to the new +    /// header BasicBlock. +    BasicBlock *PrevBB = nullptr; + +    /// The last IR BasicBlock in the output IR. Set to the new latch +    /// BasicBlock, used for placing the newly created BasicBlocks. +    BasicBlock *LastBB = nullptr; + +    /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case +    /// of replication, maps the BasicBlock of the last replica created. +    SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; + +    CFGState() = default; +  } CFG; + +  /// Hold a pointer to LoopInfo to register new basic blocks in the loop. +  LoopInfo *LI; + +  /// Hold a pointer to Dominator Tree to register new basic blocks in the loop. +  DominatorTree *DT; + +  /// Hold a reference to the IRBuilder used to generate output IR code. +  IRBuilder<> &Builder; + +  /// Hold a reference to the Value state information used when generating the +  /// Values of the output IR. 
+  VectorizerValueMap &ValueMap;
+
+  /// Hold a reference to a mapping between VPValues in VPlan and original
+  /// Values they correspond to.
+  VPValue2ValueTy VPValue2Value;
+
+  /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+  InnerLoopVectorizer *ILV;
+
+  VPCallback &Callback;
+};
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+private:
+  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+  /// An optional name for the block.
+  std::string Name;
+
+  /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+  /// it is a topmost VPBlockBase.
+  VPRegionBlock *Parent = nullptr;
+
+  /// List of predecessor blocks.
+  SmallVector<VPBlockBase *, 1> Predecessors;
+
+  /// List of successor blocks.
+  SmallVector<VPBlockBase *, 1> Successors;
+
+  /// Add \p Successor as the last successor to this block.
+  void appendSuccessor(VPBlockBase *Successor) {
+    assert(Successor && "Cannot add nullptr successor!");
+    Successors.push_back(Successor);
+  }
+
+  /// Add \p Predecessor as the last predecessor to this block.
+  void appendPredecessor(VPBlockBase *Predecessor) {
+    assert(Predecessor && "Cannot add nullptr predecessor!");
+    Predecessors.push_back(Predecessor);
+  }
+
+  /// Remove \p Predecessor from the predecessors of this block.
+  void removePredecessor(VPBlockBase *Predecessor) {
+    auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor);
+    assert(Pos != Predecessors.end() && "Predecessor does not exist");
+    Predecessors.erase(Pos);
+  }
+
+  /// Remove \p Successor from the successors of this block.
+  void removeSuccessor(VPBlockBase *Successor) {
+    auto Pos = std::find(Successors.begin(), Successors.end(), Successor);
+    assert(Pos != Successors.end() && "Successor does not exist");
+    Successors.erase(Pos);
+  }
+
+protected:
+  VPBlockBase(const unsigned char SC, const std::string &N)
+      : SubclassID(SC), Name(N) {}
+
+public:
+  /// An enumeration for keeping track of the concrete subclass of VPBlockBase
+  /// that is actually instantiated. Values of this enumeration are kept in the
+  /// SubclassID field of the VPBlockBase objects. They are used for concrete
+  /// type identification.
+  using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
+
+  using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
+
+  virtual ~VPBlockBase() = default;
+
+  const std::string &getName() const { return Name; }
+
+  void setName(const Twine &newName) { Name = newName.str(); }
+
+  /// \return an ID for the concrete type of this object.
+  /// This is used to implement the classof checks. This should not be used
+  /// for any other purpose, as the values may change as LLVM evolves.
+  unsigned getVPBlockID() const { return SubclassID; }
+
+  const VPRegionBlock *getParent() const { return Parent; }
+
+  void setParent(VPRegionBlock *P) { Parent = P; }
+
+  /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+  /// VPBlockBase is a VPBasicBlock, it is returned.
+  const VPBasicBlock *getEntryBasicBlock() const;
+  VPBasicBlock *getEntryBasicBlock();
+
+  /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+  /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+  /// VPBlockBase is a VPBasicBlock, it is returned.
+  const VPBasicBlock *getExitBasicBlock() const; +  VPBasicBlock *getExitBasicBlock(); + +  const VPBlocksTy &getSuccessors() const { return Successors; } +  VPBlocksTy &getSuccessors() { return Successors; } + +  const VPBlocksTy &getPredecessors() const { return Predecessors; } +  VPBlocksTy &getPredecessors() { return Predecessors; } + +  /// \return the successor of this VPBlockBase if it has a single successor. +  /// Otherwise return a null pointer. +  VPBlockBase *getSingleSuccessor() const { +    return (Successors.size() == 1 ? *Successors.begin() : nullptr); +  } + +  /// \return the predecessor of this VPBlockBase if it has a single +  /// predecessor. Otherwise return a null pointer. +  VPBlockBase *getSinglePredecessor() const { +    return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr); +  } + +  /// An Enclosing Block of a block B is any block containing B, including B +  /// itself. \return the closest enclosing block starting from "this", which +  /// has successors. \return the root enclosing block if all enclosing blocks +  /// have no successors. +  VPBlockBase *getEnclosingBlockWithSuccessors(); + +  /// \return the closest enclosing block starting from "this", which has +  /// predecessors. \return the root enclosing block if all enclosing blocks +  /// have no predecessors. +  VPBlockBase *getEnclosingBlockWithPredecessors(); + +  /// \return the successors either attached directly to this VPBlockBase or, if +  /// this VPBlockBase is the exit block of a VPRegionBlock and has no +  /// successors of its own, search recursively for the first enclosing +  /// VPRegionBlock that has successors and return them. If no such +  /// VPRegionBlock exists, return the (empty) successors of the topmost +  /// VPBlockBase reached. +  const VPBlocksTy &getHierarchicalSuccessors() { +    return getEnclosingBlockWithSuccessors()->getSuccessors(); +  } + +  /// \return the hierarchical successor of this VPBlockBase if it has a single +  /// hierarchical successor. Otherwise return a null pointer. +  VPBlockBase *getSingleHierarchicalSuccessor() { +    return getEnclosingBlockWithSuccessors()->getSingleSuccessor(); +  } + +  /// \return the predecessors either attached directly to this VPBlockBase or, +  /// if this VPBlockBase is the entry block of a VPRegionBlock and has no +  /// predecessors of its own, search recursively for the first enclosing +  /// VPRegionBlock that has predecessors and return them. If no such +  /// VPRegionBlock exists, return the (empty) predecessors of the topmost +  /// VPBlockBase reached. +  const VPBlocksTy &getHierarchicalPredecessors() { +    return getEnclosingBlockWithPredecessors()->getPredecessors(); +  } + +  /// \return the hierarchical predecessor of this VPBlockBase if it has a +  /// single hierarchical predecessor. Otherwise return a null pointer. +  VPBlockBase *getSingleHierarchicalPredecessor() { +    return getEnclosingBlockWithPredecessors()->getSinglePredecessor(); +  } + +  /// Sets a given VPBlockBase \p Successor as the single successor and \return +  /// \p Successor. The parent of this Block is copied to be the parent of +  /// \p Successor. +  VPBlockBase *setOneSuccessor(VPBlockBase *Successor) { +    assert(Successors.empty() && "Setting one successor when others exist."); +    appendSuccessor(Successor); +    Successor->appendPredecessor(this); +    Successor->Parent = Parent; +    return Successor; +  } + +  /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two +  /// successors. 
The parent of this Block is copied to be the parent of both
+  /// \p IfTrue and \p IfFalse.
+  void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+    assert(Successors.empty() && "Setting two successors when others exist.");
+    appendSuccessor(IfTrue);
+    appendSuccessor(IfFalse);
+    IfTrue->appendPredecessor(this);
+    IfFalse->appendPredecessor(this);
+    IfTrue->Parent = Parent;
+    IfFalse->Parent = Parent;
+  }
+
+  void disconnectSuccessor(VPBlockBase *Successor) {
+    assert(Successor && "Successor to disconnect is null.");
+    removeSuccessor(Successor);
+    Successor->removePredecessor(this);
+  }
+
+  /// The method which generates the output IR that corresponds to this
+  /// VPBlockBase, thereby "executing" the VPlan.
+  virtual void execute(struct VPTransformState *State) = 0;
+
+  /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+  static void deleteCFG(VPBlockBase *Entry);
+};
+
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+/// instructions.
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
+  friend VPBasicBlock;
+
+private:
+  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+  /// Each VPRecipe belongs to a single VPBasicBlock.
+  VPBasicBlock *Parent = nullptr;
+
+public:
+  /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
+  /// that is actually instantiated. Values of this enumeration are kept in the
+  /// SubclassID field of the VPRecipeBase objects. They are used for concrete
+  /// type identification.
+  using VPRecipeTy = enum {
+    VPBlendSC,
+    VPBranchOnMaskSC,
+    VPInstructionSC,
+    VPInterleaveSC,
+    VPPredInstPHISC,
+    VPReplicateSC,
+    VPWidenIntOrFpInductionSC,
+    VPWidenMemoryInstructionSC,
+    VPWidenPHISC,
+    VPWidenSC,
+  };
+
+  VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
+  virtual ~VPRecipeBase() = default;
+
+  /// \return an ID for the concrete type of this object.
+  /// This is used to implement the classof checks. This should not be used
+  /// for any other purpose, as the values may change as LLVM evolves.
+  unsigned getVPRecipeID() const { return SubclassID; }
+
+  /// \return the VPBasicBlock which this VPRecipe belongs to.
+  VPBasicBlock *getParent() { return Parent; }
+  const VPBasicBlock *getParent() const { return Parent; }
+
+  /// The method which generates the output IR instructions that correspond to
+  /// this VPRecipe, thereby "executing" the VPlan.
+  virtual void execute(struct VPTransformState &State) = 0;
+
+  /// Each recipe prints itself.
+  virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+};
+
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// While, as any Recipe, it may generate a sequence of IR instructions when
+/// executed, these instructions would always form a single-def expression as
+/// the VPInstruction is also a single def-use vertex.
+class VPInstruction : public VPUser, public VPRecipeBase {
+public:
+  /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+  enum { Not = Instruction::OtherOpsEnd + 1 };
+
+private:
+  typedef unsigned char OpcodeTy;
+  OpcodeTy Opcode;
+
+  /// Utility method serving execute(): generates a single instance of the
+  /// modeled instruction.
+  void generateInstruction(VPTransformState &State, unsigned Part); + +public: +  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) +      : VPUser(VPValue::VPInstructionSC, Operands), +        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {} + +  /// Method to support type inquiry through isa, cast, and dyn_cast. +  static inline bool classof(const VPValue *V) { +    return V->getVPValueID() == VPValue::VPInstructionSC; +  } + +  /// Method to support type inquiry through isa, cast, and dyn_cast. +  static inline bool classof(const VPRecipeBase *R) { +    return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC; +  } + +  unsigned getOpcode() const { return Opcode; } + +  /// Generate the instruction. +  /// TODO: We currently execute only per-part unless a specific instance is +  /// provided. +  void execute(VPTransformState &State) override; + +  /// Print the Recipe. +  void print(raw_ostream &O, const Twine &Indent) const override; + +  /// Print the VPInstruction. +  void print(raw_ostream &O) const; +}; + +/// VPWidenRecipe is a recipe for producing a copy of vector type for each +/// Instruction in its ingredients independently, in order. This recipe covers +/// most of the traditional vectorization cases where each ingredient transforms +/// into a vectorized version of itself. +class VPWidenRecipe : public VPRecipeBase { +private: +  /// Hold the ingredients by pointing to their original BasicBlock location. +  BasicBlock::iterator Begin; +  BasicBlock::iterator End; + +public: +  VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { +    End = I->getIterator(); +    Begin = End++; +  } + +  ~VPWidenRecipe() override = default; + +  /// Method to support type inquiry through isa, cast, and dyn_cast. +  static inline bool classof(const VPRecipeBase *V) { +    return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; +  } + +  /// Produce widened copies of all Ingredients. +  void execute(VPTransformState &State) override; + +  /// Augment the recipe to include Instr, if it lies at its End. +  bool appendInstruction(Instruction *Instr) { +    if (End != Instr->getIterator()) +      return false; +    End++; +    return true; +  } + +  /// Print the recipe. +  void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for handling phi nodes of integer and floating-point inductions, +/// producing their vector and scalar values. +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +private: +  PHINode *IV; +  TruncInst *Trunc; + +public: +  VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) +      : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} +  ~VPWidenIntOrFpInductionRecipe() override = default; + +  /// Method to support type inquiry through isa, cast, and dyn_cast. +  static inline bool classof(const VPRecipeBase *V) { +    return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; +  } + +  /// Generate the vectorized and scalarized versions of the phi node as +  /// needed by their users. +  void execute(VPTransformState &State) override; + +  /// Print the recipe. +  void print(raw_ostream &O, const Twine &Indent) const override; +}; + +/// A recipe for handling all phi nodes except for integer and FP inductions. 
+class VPWidenPHIRecipe : public VPRecipeBase {
+private:
+  PHINode *Phi;
+
+public:
+  VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
+  ~VPWidenPHIRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
+  }
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
+class VPBlendRecipe : public VPRecipeBase {
+private:
+  PHINode *Phi;
+
+  /// The blend operation is a User of a mask, if not null.
+  std::unique_ptr<VPUser> User;
+
+public:
+  VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks)
+      : VPRecipeBase(VPBlendSC), Phi(Phi) {
+    assert((Phi->getNumIncomingValues() == 1 ||
+            Phi->getNumIncomingValues() == Masks.size()) &&
+           "Expected the same number of incoming values and masks");
+    if (!Masks.empty())
+      User.reset(new VPUser(Masks));
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+  }
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of loads
+/// or stores into one wide load/store and shuffles.
+class VPInterleaveRecipe : public VPRecipeBase {
+private:
+  const InterleaveGroup *IG;
+
+public:
+  VPInterleaveRecipe(const InterleaveGroup *IG)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+  ~VPInterleaveRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
+  }
+
+  /// Generate the wide load or store, and shuffles.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+
+  const InterleaveGroup *getInterleaveGroup() { return IG; }
+};
+
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to be
+/// uniform, only one copy, per lane zero, will be generated.
+class VPReplicateRecipe : public VPRecipeBase {
+private:
+  /// The instruction being replicated.
+  Instruction *Ingredient;
+
+  /// Indicator if only a single replica per lane is needed.
+  bool IsUniform;
+
+  /// Indicator if the replicas are also predicated.
+  bool IsPredicated;
+
+  /// Indicator if the scalar values should also be packed into a vector.
+  bool AlsoPack;
+
+public:
+  VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
+      : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
+        IsPredicated(IsPredicated) {
+    // Retain the previous behavior of predicateInstructions(), where an
+    // insert-element of a predicated instruction got hoisted into the
+    // predicated basic block iff it was its only user.
This is achieved by
+    // having predicated instructions also pack their values into a vector by
+    // default unless they have a replicated user which uses their scalar value.
+    AlsoPack = IsPredicated && !I->use_empty();
+  }
+
+  ~VPReplicateRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
+  }
+
+  /// Generate replicas of the desired Ingredient. Replicas will be generated
+  /// for all parts and lanes unless a specific part and lane are specified in
+  /// the \p State.
+  void execute(VPTransformState &State) override;
+
+  void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A recipe for generating conditional branches on the bits of a mask.
+class VPBranchOnMaskRecipe : public VPRecipeBase {
+private:
+  std::unique_ptr<VPUser> User;
+
+public:
+  VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+    if (BlockInMask) // nullptr means all-one mask.
+      User.reset(new VPUser({BlockInMask}));
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
+  }
+
+  /// Generate the extraction of the appropriate bit from the block mask and the
+  /// conditional branch.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override {
+    O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+    if (User)
+      O << *User->getOperand(0);
+    else
+      O << " All-One";
+    O << "\\l\"";
+  }
+};
+
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
+class VPPredInstPHIRecipe : public VPRecipeBase {
+private:
+  Instruction *PredInst;
+
+public:
+  /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
+  /// nodes after merging back from a Branch-on-Mask.
+  VPPredInstPHIRecipe(Instruction *PredInst)
+      : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+  ~VPPredInstPHIRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+  }
+
+  /// Generates phi nodes for live-outs as needed to retain SSA form.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// A Recipe for widening load/store operations.
+/// TODO: We currently execute only per-part unless a specific instance is
+/// provided.
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+private:
+  Instruction &Instr;
+  std::unique_ptr<VPUser> User;
+
+public:
+  VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask)
+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {
+    if (Mask) // Create a VPUser to register as a user of the mask.
+      User.reset(new VPUser({Mask}));
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+  }
+
+  /// Generate the wide load/store.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent) const override;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipes, each representing a sequence of
+/// output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+  using RecipeListTy = iplist<VPRecipeBase>;
+
+private:
+  /// The VPRecipes held in the order of output instructions to generate.
+  RecipeListTy Recipes;
+
+public:
+  VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+      : VPBlockBase(VPBasicBlockSC, Name.str()) {
+    if (Recipe)
+      appendRecipe(Recipe);
+  }
+
+  ~VPBasicBlock() override { Recipes.clear(); }
+
+  /// Recipe iterators...
+  using iterator = RecipeListTy::iterator;
+  using const_iterator = RecipeListTy::const_iterator;
+  using reverse_iterator = RecipeListTy::reverse_iterator;
+  using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
+
+  //===--------------------------------------------------------------------===//
+  /// Recipe iterator methods
+  ///
+  inline iterator begin() { return Recipes.begin(); }
+  inline const_iterator begin() const { return Recipes.begin(); }
+  inline iterator end() { return Recipes.end(); }
+  inline const_iterator end() const { return Recipes.end(); }
+
+  inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+  inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+  inline reverse_iterator rend() { return Recipes.rend(); }
+  inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+  inline size_t size() const { return Recipes.size(); }
+  inline bool empty() const { return Recipes.empty(); }
+  inline const VPRecipeBase &front() const { return Recipes.front(); }
+  inline VPRecipeBase &front() { return Recipes.front(); }
+  inline const VPRecipeBase &back() const { return Recipes.back(); }
+  inline VPRecipeBase &back() { return Recipes.back(); }
+
+  /// \brief Returns a pointer to a member of the recipe list.
+  static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+    return &VPBasicBlock::Recipes;
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPBlockBase *V) {
+    return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+  }
+
+  void insert(VPRecipeBase *Recipe, iterator InsertPt) {
+    assert(Recipe && "No recipe to insert.");
+    assert(!Recipe->Parent && "Recipe already in VPlan");
+    Recipe->Parent = this;
+    Recipes.insert(InsertPt, Recipe);
+  }
+
+  /// Augment the existing recipes of a VPBasicBlock with an additional
+  /// \p Recipe as the last recipe.
+  void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
+
+  /// The method which generates the output IR instructions that correspond to
+  /// this VPBasicBlock, thereby "executing" the VPlan.
+  void execute(struct VPTransformState *State) override;
+
+private:
+  /// Create an IR BasicBlock to hold the output instructions generated by this
+  /// VPBasicBlock, and return it. Update the CFGState accordingly.
+  BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
+
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VFs. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+private:
+  /// Hold the Single Entry of the SESE region modeled by the VPRegionBlock.
+  VPBlockBase *Entry;
+
+  /// Hold the Single Exit of the SESE region modeled by the VPRegionBlock.
+  VPBlockBase *Exit;
+
+  /// An indicator whether this region is to generate multiple replicated
+  /// instances of output IR corresponding to its VPBlockBases.
+  bool IsReplicator;
+
+public:
+  VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+                const std::string &Name = "", bool IsReplicator = false)
+      : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+        IsReplicator(IsReplicator) {
+    assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+    assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+    Entry->setParent(this);
+    Exit->setParent(this);
+  }
+
+  ~VPRegionBlock() override {
+    if (Entry)
+      deleteCFG(Entry);
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPBlockBase *V) {
+    return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+  }
+
+  const VPBlockBase *getEntry() const { return Entry; }
+  VPBlockBase *getEntry() { return Entry; }
+
+  const VPBlockBase *getExit() const { return Exit; }
+  VPBlockBase *getExit() { return Exit; }
+
+  /// An indicator whether this region is to generate multiple replicated
+  /// instances of output IR corresponding to its VPBlockBases.
+  bool isReplicator() const { return IsReplicator; }
+
+  /// The method which generates the output IR instructions that correspond to
+  /// this VPRegionBlock, thereby "executing" the VPlan.
+  void execute(struct VPTransformState *State) override;
+};
+
+/// VPlan models a candidate for vectorization, encoding various decisions
+/// taken to produce efficient output IR, including which branches,
+/// basic-blocks and output IR instructions to generate, and their cost. VPlan
+/// holds a Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an
+/// Entry VPBlock.
+class VPlan {
+  friend class VPlanPrinter;
+
+private:
+  /// Hold the single entry to the Hierarchical CFG of the VPlan.
+  VPBlockBase *Entry;
+
+  /// Holds the VFs applicable to this VPlan.
+  SmallSet<unsigned, 2> VFs;
+
+  /// Holds the name of the VPlan, for printing.
+  std::string Name;
+
+  /// Holds a mapping between Values and their corresponding VPValue inside
+  /// VPlan.
+  Value2VPValueTy Value2VPValue;
+
+public:
+  VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+
+  ~VPlan() {
+    if (Entry)
+      VPBlockBase::deleteCFG(Entry);
+    for (auto &MapEntry : Value2VPValue)
+      delete MapEntry.second;
+  }
+
+  /// Generate the IR code for this VPlan.
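+  ///
+  /// For illustration only, a driver might exercise this API as follows,
+  /// assuming \p State has been populated by the loop vectorizer and
+  /// EntryBlock is some existing VPBlockBase (both names hypothetical):
+  /// \code
+  ///   VPlan Plan(EntryBlock);
+  ///   Plan.addVF(4);
+  ///   if (Plan.hasVF(4))
+  ///     Plan.execute(&State);
+  /// \endcode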
+  void execute(struct VPTransformState *State);
+
+  VPBlockBase *getEntry() { return Entry; }
+  const VPBlockBase *getEntry() const { return Entry; }
+
+  VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+
+  void addVF(unsigned VF) { VFs.insert(VF); }
+
+  bool hasVF(unsigned VF) { return VFs.count(VF); }
+
+  const std::string &getName() const { return Name; }
+
+  void setName(const Twine &newName) { Name = newName.str(); }
+
+  void addVPValue(Value *V) {
+    assert(V && "Trying to add a null Value to VPlan");
+    assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+    Value2VPValue[V] = new VPValue();
+  }
+
+  VPValue *getVPValue(Value *V) {
+    assert(V && "Trying to get the VPValue of a null Value");
+    assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+    return Value2VPValue[V];
+  }
+
+private:
+  /// Add to the given dominator tree the header block and every new basic block
+  /// that was created between it and the latch block, inclusive.
+  static void updateDominatorTree(DominatorTree *DT,
+                                  BasicBlock *LoopPreHeaderBB,
+                                  BasicBlock *LoopLatchBB);
+};
+
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+  friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan);
+  friend inline raw_ostream &operator<<(raw_ostream &OS,
+                                        const struct VPlanIngredient &I);
+
+private:
+  raw_ostream &OS;
+  VPlan &Plan;
+  unsigned Depth;
+  unsigned TabWidth = 2;
+  std::string Indent;
+  unsigned BID = 0;
+  SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+  VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {}
+
+  /// Handle indentation.
+  void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+  /// Print a given \p Block of the Plan.
+  void dumpBlock(const VPBlockBase *Block);
+
+  /// Print the information related to the CFG edges going out of a given
+  /// \p Block, followed by printing the successor blocks themselves.
+  void dumpEdges(const VPBlockBase *Block);
+
+  /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+  /// its successor blocks.
+  void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+  /// Print a given \p Region of the Plan.
+  void dumpRegion(const VPRegionBlock *Region);
+
+  unsigned getOrCreateBID(const VPBlockBase *Block) {
+    return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+  }
+
+  const Twine getOrCreateName(const VPBlockBase *Block);
+
+  const Twine getUID(const VPBlockBase *Block);
+
+  /// Print the information related to a CFG edge between two VPBlockBases.
+  void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+                const Twine &Label);
+
+  void dump();
+
+  static void printAsIngredient(raw_ostream &O, Value *V);
+};
+
+struct VPlanIngredient {
+  Value *V;
+
+  VPlanIngredient(Value *V) : V(V) {}
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+  VPlanPrinter::printAsIngredient(OS, I.V);
+  return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) {
+  VPlanPrinter Printer(OS, Plan);
+  Printer.dump();
+  return OS;
+}
+
+//===--------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs  //
+//===--------------------------------------------------------------------===//
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a
+// graph of VPBlockBase nodes...
+
+template <> struct GraphTraits<VPBlockBase *> {
+  using NodeRef = VPBlockBase *;
+  using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->getSuccessors().begin();
+  }
+
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->getSuccessors().end();
+  }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+  using NodeRef = const VPBlockBase *;
+  using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->getSuccessors().begin();
+  }
+
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->getSuccessors().end();
+  }
+};
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a
+// graph of VPBlockBase nodes... and to walk it in inverse order. Inverse order
+// for a VPBlockBase is obtained by traversing its predecessors instead of its
+// successors.
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+  using NodeRef = VPBlockBase *;
+  using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+  static Inverse<VPBlockBase *> getEntryNode(Inverse<VPBlockBase *> B) {
+    return B;
+  }
+
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return N->getPredecessors().begin();
+  }
+
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return N->getPredecessors().end();
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/lib/Transforms/Vectorize/VPlanBuilder.h b/lib/Transforms/Vectorize/VPlanBuilder.h
new file mode 100644
index 000000000000..d6eb3397d044
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanBuilder.h
@@ -0,0 +1,61 @@
+//===- VPlanBuilder.h - A VPlan utility for constructing VPInstructions ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
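+///
+/// For illustration only: given an existing VPBasicBlock VPBB and mask values
+/// M1 and M2 (hypothetical names), an edge mask could be built as follows:
+/// \code
+///   VPBuilder Builder;
+///   Builder.setInsertPoint(VPBB);
+///   VPValue *NotM1 = Builder.createNot(M1);
+///   VPValue *EdgeMask = Builder.createAnd(NotM1, M2);
+/// \endcode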
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
+
+#include "VPlan.h"
+
+namespace llvm {
+
+class VPBuilder {
+private:
+  VPBasicBlock *BB = nullptr;
+  VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+  VPInstruction *createInstruction(unsigned Opcode,
+                                   std::initializer_list<VPValue *> Operands) {
+    VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+    BB->insert(Instr, InsertPt);
+    return Instr;
+  }
+
+public:
+  VPBuilder() = default;
+
+  /// \brief This specifies that created VPInstructions should be appended to
+  /// the end of the specified block.
+  void setInsertPoint(VPBasicBlock *TheBB) {
+    assert(TheBB && "Attempting to set a null insert point");
+    BB = TheBB;
+    InsertPt = BB->end();
+  }
+
+  VPValue *createNot(VPValue *Operand) {
+    return createInstruction(VPInstruction::Not, {Operand});
+  }
+
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+  }
+
+  VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_BUILDER_H
diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h
new file mode 100644
index 000000000000..50966891e0eb
--- /dev/null
+++ b/lib/Transforms/Vectorize/VPlanValue.h
@@ -0,0 +1,146 @@
+//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the entities induced by Vectorization
+/// Plans, e.g. the instructions the VPlan intends to generate if executed.
+/// VPlan models the following entities:
+/// VPValue
+///  |-- VPUser
+///  |    |-- VPInstruction
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+// Forward declarations.
+class VPUser;
+
+// This is the base class of the VPlan Def/Use graph, used for modeling the data
+// flow into, within and out of the VPlan. VPValues can stand for live-ins
+// coming from the input IR, instructions which VPlan will generate if executed
+// and live-outs which the VPlan will need to fix accordingly.
+class VPValue {
+private:
+  const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+  SmallVector<VPUser *, 1> Users;
+
+protected:
+  VPValue(const unsigned char SC) : SubclassID(SC) {}
+
+public:
+  /// An enumeration for keeping track of the concrete subclasses of VPValue
+  /// that are actually instantiated. Values of this enumeration are kept in
+  /// the SubclassID field of the VPValue objects. They are used for concrete
+  /// type identification.
+  enum { VPValueSC, VPUserSC, VPInstructionSC };
+
+  VPValue() : SubclassID(VPValueSC) {}
+  VPValue(const VPValue &) = delete;
+  VPValue &operator=(const VPValue &) = delete;
+
+  /// \return an ID for the concrete type of this object.
+  /// This is used to implement the classof checks. This should not be used
+  /// for any other purpose, as the values may change as LLVM evolves.
+  unsigned getVPValueID() const { return SubclassID; }
+
+  void printAsOperand(raw_ostream &OS) const {
+    OS << "%vp" << (unsigned short)(unsigned long long)this;
+  }
+
+  unsigned getNumUsers() const { return Users.size(); }
+  void addUser(VPUser &User) { Users.push_back(&User); }
+
+  typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
+  typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
+  typedef iterator_range<user_iterator> user_range;
+  typedef iterator_range<const_user_iterator> const_user_range;
+
+  user_iterator user_begin() { return Users.begin(); }
+  const_user_iterator user_begin() const { return Users.begin(); }
+  user_iterator user_end() { return Users.end(); }
+  const_user_iterator user_end() const { return Users.end(); }
+  user_range users() { return user_range(user_begin(), user_end()); }
+  const_user_range users() const {
+    return const_user_range(user_begin(), user_end());
+  }
+};
+
+typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
+typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
+
+raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
+
+/// This class augments VPValue with operands which provide the inverse def-use
+/// edges from VPValue's users to their defs.
+class VPUser : public VPValue {
+private:
+  SmallVector<VPValue *, 2> Operands;
+
+  void addOperand(VPValue *Operand) {
+    Operands.push_back(Operand);
+    Operand->addUser(*this);
+  }
+
+protected:
+  VPUser(const unsigned char SC) : VPValue(SC) {}
+  VPUser(const unsigned char SC, ArrayRef<VPValue *> Operands) : VPValue(SC) {
+    for (VPValue *Operand : Operands)
+      addOperand(Operand);
+  }
+
+public:
+  VPUser() : VPValue(VPValue::VPUserSC) {}
+  VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
+  VPUser(std::initializer_list<VPValue *> Operands)
+      : VPUser(ArrayRef<VPValue *>(Operands)) {}
+  VPUser(const VPUser &) = delete;
+  VPUser &operator=(const VPUser &) = delete;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
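+  ///
+  /// For illustration only, assuming some VPValue *V is at hand:
+  /// \code
+  ///   if (auto *U = dyn_cast<VPUser>(V)) // VPUser or VPInstruction.
+  ///     for (VPValue *Op : U->operands())
+  ///       Op->printAsOperand(dbgs());
+  /// \endcode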
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() >= VPUserSC &&
+           V->getVPValueID() <= VPInstructionSC;
+  }
+
+  unsigned getNumOperands() const { return Operands.size(); }
+  inline VPValue *getOperand(unsigned N) const {
+    assert(N < Operands.size() && "Operand index out of bounds");
+    return Operands[N];
+  }
+
+  typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
+  typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
+  typedef iterator_range<operand_iterator> operand_range;
+  typedef iterator_range<const_operand_iterator> const_operand_range;
+
+  operand_iterator op_begin() { return Operands.begin(); }
+  const_operand_iterator op_begin() const { return Operands.begin(); }
+  operand_iterator op_end() { return Operands.end(); }
+  const_operand_iterator op_end() const { return Operands.end(); }
+  operand_range operands() { return operand_range(op_begin(), op_end()); }
+  const_operand_range operands() const {
+    return const_operand_range(op_begin(), op_end());
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index fb2f509dcbaa..b04905bfc6fa 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -18,7 +18,6 @@
 #include "llvm-c/Transforms/Vectorize.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
 using namespace llvm;
