Diffstat (limited to 'lib/Transforms/Vectorize/SLPVectorizer.cpp')
 lib/Transforms/Vectorize/SLPVectorizer.cpp | 1926 ++++++++++++++++----------
 1 file changed, 1196 insertions(+), 730 deletions(-)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index a7ccd3faec44e..ac8c4f046c6ff 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -161,7 +161,7 @@ static const unsigned MaxMemDepDistance = 160;  /// regions to be handled.  static const int MinScheduleRegionSize = 16; -/// \brief Predicate for the element types that the SLP vectorizer supports. +/// Predicate for the element types that the SLP vectorizer supports.  ///  /// The most important thing to filter here are types which are invalid in LLVM  /// vectors. We also filter target specific types which have absolutely no @@ -246,13 +246,15 @@ static bool isSplat(ArrayRef<Value *> VL) {  /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3  /// ret <4 x i8> %ins4  /// InstCombiner transforms this into a shuffle and vector mul +/// TODO: Can we split off and reuse the shuffle mask detection from +/// TargetTransformInfo::getInstructionThroughput?  static Optional<TargetTransformInfo::ShuffleKind>  isShuffle(ArrayRef<Value *> VL) {    auto *EI0 = cast<ExtractElementInst>(VL[0]);    unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();    Value *Vec1 = nullptr;    Value *Vec2 = nullptr; -  enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute}; +  enum ShuffleMode { Unknown, Select, Permute };    ShuffleMode CommonShuffleMode = Unknown;    for (unsigned I = 0, E = VL.size(); I < E; ++I) {      auto *EI = cast<ExtractElementInst>(VL[I]); @@ -272,7 +274,11 @@ isShuffle(ArrayRef<Value *> VL) {        continue;      // For correct shuffling we have to have at most 2 different vector operands      // in all extractelement instructions. -    if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2) +    if (!Vec1 || Vec1 == Vec) +      Vec1 = Vec; +    else if (!Vec2 || Vec2 == Vec) +      Vec2 = Vec; +    else        return None;      if (CommonShuffleMode == Permute)        continue; @@ -282,119 +288,17 @@ isShuffle(ArrayRef<Value *> VL) {        CommonShuffleMode = Permute;        continue;      } -    // Check the shuffle mode for the current operation. -    if (!Vec1) -      Vec1 = Vec; -    else if (Vec != Vec1) -      Vec2 = Vec; -    // Example: shufflevector A, B, <0,5,2,7> -    // I is odd and IntIdx for A == I - FirstAlternate shuffle. -    // I is even and IntIdx for B == I - FirstAlternate shuffle. -    // Example: shufflevector A, B, <4,1,6,3> -    // I is even and IntIdx for A == I - SecondAlternate shuffle. -    // I is odd and IntIdx for B == I - SecondAlternate shuffle. -    const bool IIsEven = I & 1; -    const bool CurrVecIsA = Vec == Vec1; -    const bool IIsOdd = !IIsEven; -    const bool CurrVecIsB = !CurrVecIsA; -    ShuffleMode CurrentShuffleMode = -        ((IIsOdd && CurrVecIsA) || (IIsEven && CurrVecIsB)) ? FirstAlternate -                                                            : SecondAlternate; -    // Common mode is not set or the same as the shuffle mode of the current -    // operation - alternate. -    if (CommonShuffleMode == Unknown) -      CommonShuffleMode = CurrentShuffleMode; -    // Common shuffle mode is not the same as the shuffle mode of the current -    // operation - permutation. -    if (CommonShuffleMode != CurrentShuffleMode) -      CommonShuffleMode = Permute; +    CommonShuffleMode = Select;    }    // If we're not crossing lanes in different vectors, consider it as blending. 
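A minimal standalone sketch (plain C++, not the LLVM API; the name isSelectMask is hypothetical) of the property the rewritten classification loop above checks: a shuffle is a "select" when every output lane I takes element I from one of the two source vectors, so no lane crosses positions. The old FirstAlternate/SecondAlternate distinction was just the two even/odd special cases of this.

#include <vector>

// A mask selects (rather than permutes) iff output lane I reads element I
// of either source: Mask[I] == I or Mask[I] == I + Size.
static bool isSelectMask(const std::vector<int> &Mask, int Size) {
  for (int I = 0, E = (int)Mask.size(); I != E; ++I)
    if (Mask[I] != I && Mask[I] != I + Size)
      return false; // lane crossing -> permute, not select
  return true;
}
// isSelectMask({0, 5, 2, 7}, 4) == true   (the old "FirstAlternate")
// isSelectMask({4, 1, 6, 3}, 4) == true   (the old "SecondAlternate")
// isSelectMask({1, 0, 3, 2}, 4) == false  (lanes swapped -> permute)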
-  if ((CommonShuffleMode == FirstAlternate || -       CommonShuffleMode == SecondAlternate) && -      Vec2) -    return TargetTransformInfo::SK_Alternate; +  if (CommonShuffleMode == Select && Vec2) +    return TargetTransformInfo::SK_Select;    // If Vec2 was never used, we have a permutation of a single vector, otherwise    // we have permutation of 2 vectors.    return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc                : TargetTransformInfo::SK_PermuteSingleSrc;  } -///\returns Opcode that can be clubbed with \p Op to create an alternate -/// sequence which can later be merged as a ShuffleVector instruction. -static unsigned getAltOpcode(unsigned Op) { -  switch (Op) { -  case Instruction::FAdd: -    return Instruction::FSub; -  case Instruction::FSub: -    return Instruction::FAdd; -  case Instruction::Add: -    return Instruction::Sub; -  case Instruction::Sub: -    return Instruction::Add; -  default: -    return 0; -  } -} - -static bool isOdd(unsigned Value) { -  return Value & 1; -} - -static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode, -                            unsigned CheckedOpcode) { -  return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode; -} - -/// Chooses the correct key for scheduling data. If \p Op has the same (or -/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p -/// OpValue. -static Value *isOneOf(Value *OpValue, Value *Op) { -  auto *I = dyn_cast<Instruction>(Op); -  if (!I) -    return OpValue; -  auto *OpInst = cast<Instruction>(OpValue); -  unsigned OpInstOpcode = OpInst->getOpcode(); -  unsigned IOpcode = I->getOpcode(); -  if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode)) -    return Op; -  return OpValue; -} - -namespace { - -/// Contains data for the instructions going to be vectorized. -struct RawInstructionsData { -  /// Main Opcode of the instructions going to be vectorized. -  unsigned Opcode = 0; - -  /// The list of instructions have some instructions with alternate opcodes. -  bool HasAltOpcodes = false; -}; - -} // end anonymous namespace - -/// Checks the list of the vectorized instructions \p VL and returns info about -/// this list. -static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) { -  auto *I0 = dyn_cast<Instruction>(VL[0]); -  if (!I0) -    return {}; -  RawInstructionsData Res; -  unsigned Opcode = I0->getOpcode(); -  // Walk through the list of the vectorized instructions -  // in order to check its structure described by RawInstructionsData. -  for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) { -    auto *I = dyn_cast<Instruction>(VL[Cnt]); -    if (!I) -      return {}; -    if (Opcode != I->getOpcode()) -      Res.HasAltOpcodes = true; -  } -  Res.Opcode = Opcode; -  return Res; -} -  namespace {  /// Main data required for vectorization of instructions. @@ -402,42 +306,90 @@ struct InstructionsState {    /// The very first instruction in the list with the main opcode.    Value *OpValue = nullptr; -  /// The main opcode for the list of instructions. -  unsigned Opcode = 0; +  /// The main/alternate instruction. +  Instruction *MainOp = nullptr; +  Instruction *AltOp = nullptr; + +  /// The main/alternate opcodes for the list of instructions. +  unsigned getOpcode() const { +    return MainOp ? MainOp->getOpcode() : 0; +  } + +  unsigned getAltOpcode() const { +    return AltOp ? AltOp->getOpcode() : 0; +  }    /// Some of the instructions in the list have alternate opcodes. 
-  bool IsAltShuffle = false; +  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + +  bool isOpcodeOrAlt(Instruction *I) const { +    unsigned CheckedOpcode = I->getOpcode(); +    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; +  } -  InstructionsState() = default; -  InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle) -      : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {} +  InstructionsState() = delete; +  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) +      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}  };  } // end anonymous namespace +/// Chooses the correct key for scheduling data. If \p Op has the same (or +/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p +/// OpValue. +static Value *isOneOf(const InstructionsState &S, Value *Op) { +  auto *I = dyn_cast<Instruction>(Op); +  if (I && S.isOpcodeOrAlt(I)) +    return Op; +  return S.OpValue; +} +  /// \returns analysis of the Instructions in \p VL described in  /// InstructionsState, the Opcode that we suppose the whole list   /// could be vectorized even if its structure is diverse. -static InstructionsState getSameOpcode(ArrayRef<Value *> VL) { -  auto Res = getMainOpcode(VL); -  unsigned Opcode = Res.Opcode; -  if (!Res.HasAltOpcodes) -    return InstructionsState(VL[0], Opcode, false); -  auto *OpInst = cast<Instruction>(VL[0]); -  unsigned AltOpcode = getAltOpcode(Opcode); -  // Examine each element in the list instructions VL to determine -  // if some operations there could be considered as an alternative -  // (for example as subtraction relates to addition operation). +static InstructionsState getSameOpcode(ArrayRef<Value *> VL, +                                       unsigned BaseIndex = 0) { +  // Make sure these are all Instructions. +  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) +    return InstructionsState(VL[BaseIndex], nullptr, nullptr); + +  bool IsCastOp = isa<CastInst>(VL[BaseIndex]); +  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); +  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); +  unsigned AltOpcode = Opcode; +  unsigned AltIndex = BaseIndex; + +  // Check for one alternate opcode from another BinaryOperator. +  // TODO - generalize to support all operators (types, calls etc.).    for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { -    auto *I = cast<Instruction>(VL[Cnt]); -    unsigned InstOpcode = I->getOpcode(); -    if ((Res.HasAltOpcodes && -         InstOpcode != (isOdd(Cnt) ? 
AltOpcode : Opcode)) || -        (!Res.HasAltOpcodes && InstOpcode != Opcode)) { -      return InstructionsState(OpInst, 0, false); -    } +    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode(); +    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { +      if (InstOpcode == Opcode || InstOpcode == AltOpcode) +        continue; +      if (Opcode == AltOpcode) { +        AltOpcode = InstOpcode; +        AltIndex = Cnt; +        continue; +      } +    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) { +      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType(); +      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType(); +      if (Ty0 == Ty1) { +        if (InstOpcode == Opcode || InstOpcode == AltOpcode) +          continue; +        if (Opcode == AltOpcode) { +          AltOpcode = InstOpcode; +          AltIndex = Cnt; +          continue; +        } +      } +    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) +      continue; +    return InstructionsState(VL[BaseIndex], nullptr, nullptr);    } -  return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes); + +  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]), +                           cast<Instruction>(VL[AltIndex]));  }  /// \returns true if all of the values in \p VL have the same type or false @@ -452,16 +404,21 @@ static bool allSameType(ArrayRef<Value *> VL) {  }  /// \returns True if Extract{Value,Element} instruction extracts element Idx. -static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) { -  assert(Opcode == Instruction::ExtractElement || -         Opcode == Instruction::ExtractValue); +static Optional<unsigned> getExtractIndex(Instruction *E) { +  unsigned Opcode = E->getOpcode(); +  assert((Opcode == Instruction::ExtractElement || +          Opcode == Instruction::ExtractValue) && +         "Expected extractelement or extractvalue instruction.");    if (Opcode == Instruction::ExtractElement) { -    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); -    return CI && CI->getZExtValue() == Idx; -  } else { -    ExtractValueInst *EI = cast<ExtractValueInst>(E); -    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx; +    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); +    if (!CI) +      return None; +    return CI->getZExtValue();    } +  ExtractValueInst *EI = cast<ExtractValueInst>(E); +  if (EI->getNumIndices() != 1) +    return None; +  return *EI->idx_begin();  }  /// \returns True if in-tree use also needs extract. This refers to @@ -549,7 +506,7 @@ public:        MinVecRegSize = TTI->getMinVectorRegisterBitWidth();    } -  /// \brief Vectorize the tree that starts with the elements in \p VL. +  /// Vectorize the tree that starts with the elements in \p VL.    /// Returns the vectorized root.    Value *vectorizeTree(); @@ -585,8 +542,8 @@ public:      ScalarToTreeEntry.clear();      MustGather.clear();      ExternalUses.clear(); -    NumLoadsWantToKeepOrder = 0; -    NumLoadsWantToChangeOrder = 0; +    NumOpsWantToKeepOrder.clear(); +    NumOpsWantToKeepOriginalOrder = 0;      for (auto &Iter : BlocksSchedules) {        BlockScheduling *BS = Iter.second.get();        BS->clear(); @@ -596,12 +553,22 @@ public:    unsigned getTreeSize() const { return VectorizableTree.size(); } -  /// \brief Perform LICM and CSE on the newly generated gather sequences. -  void optimizeGatherSequence(Function &F); +  /// Perform LICM and CSE on the newly generated gather sequences. 
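The rewritten getSameOpcode above no longer requires Opcode/AltOpcode to alternate strictly by even/odd position; any mix of exactly two binary (or compatible cast) opcodes is accepted, and the first deviating instruction defines AltOp. A sketch of that scan over bare opcode values (twoOpcodes is a hypothetical stand-in, with std::optional replacing the null InstructionsState):

#include <optional>
#include <utility>
#include <vector>

using Opc = unsigned;

static std::optional<std::pair<Opc, Opc>>
twoOpcodes(const std::vector<Opc> &VL) {
  Opc Main = VL.front(), Alt = Main;
  for (Opc O : VL) {
    if (O == Main || O == Alt)
      continue;
    if (Main == Alt) { // no alternate adopted yet: this opcode becomes Alt
      Alt = O;
      continue;
    }
    return std::nullopt; // a third distinct opcode: bundle is not uniform
  }
  return std::make_pair(Main, Alt);
}
// {Add, Sub, Add, Add} -> {Add, Sub}  (no even/odd pattern required)
// {Add, Sub, Mul, Add} -> nullopt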
+  void optimizeGatherSequence(); + +  /// \returns The best order of instructions for vectorization. +  Optional<ArrayRef<unsigned>> bestOrder() const { +    auto I = std::max_element( +        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), +        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, +           const decltype(NumOpsWantToKeepOrder)::value_type &D2) { +          return D1.second < D2.second; +        }); +    if (I == NumOpsWantToKeepOrder.end() || +        I->getSecond() <= NumOpsWantToKeepOriginalOrder) +      return None; -  /// \returns true if it is beneficial to reverse the vector order. -  bool shouldReorder() const { -    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; +    return makeArrayRef(I->getFirst());    }    /// \return The vector element size in bits to use when vectorizing the @@ -625,7 +592,7 @@ public:      return MinVecRegSize;    } -  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType. +  /// Check if ArrayType or StructType is isomorphic to some VectorType.    ///    /// \returns number of elements in vector if isomorphism exists, 0 otherwise.    unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -648,9 +615,13 @@ private:    /// This is the recursive part of buildTree.    void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); -  /// \returns True if the ExtractElement/ExtractValue instructions in VL can -  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). -  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const; +  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can +  /// be vectorized to use the original vector (or aggregate "bitcast" to a +  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise +  /// returns false, setting \p CurrentOrder to either an empty vector or a +  /// non-identity permutation that allows to reuse extract instructions. +  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, +                       SmallVectorImpl<unsigned> &CurrentOrder) const;    /// Vectorize a single entry in the tree.    Value *vectorizeTree(TreeEntry *E); @@ -658,22 +629,19 @@ private:    /// Vectorize a single entry in the tree, starting in \p VL.    Value *vectorizeTree(ArrayRef<Value *> VL); -  /// \returns the pointer to the vectorized value if \p VL is already -  /// vectorized, or NULL. They may happen in cycles. -  Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const; -    /// \returns the scalarization cost for this type. Scalarization in this    /// context means the creation of vectors from a group of scalars. -  int getGatherCost(Type *Ty); +  int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);    /// \returns the scalarization cost for this list of values. Assuming that    /// this subtree gets vectorized, we may need to extract the values from the    /// roots. This method calculates the cost of extracting the values.    int getGatherCost(ArrayRef<Value *> VL); -  /// \brief Set the Builder insert point to one after the last instruction in +  /// Set the Builder insert point to one after the last instruction in    /// the bundle -  void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue); +  void setInsertPointAfterBundle(ArrayRef<Value *> VL, +                                 const InstructionsState &S);    /// \returns a vector from a collection of scalars in \p VL.    
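bestOrder() above replaces the old boolean shouldReorder(): each vectorized bundle votes either for the original order (NumOpsWantToKeepOriginalOrder) or for a specific permutation, and the tree is reordered only if one permutation strictly outvotes the identity. A sketch of the same vote, with std::map standing in for the DenseMap and pickBestOrder as a hypothetical name:

#include <algorithm>
#include <map>
#include <optional>
#include <vector>

using OrdersType = std::vector<unsigned>;

static std::optional<OrdersType>
pickBestOrder(const std::map<OrdersType, unsigned> &OrderVotes,
              unsigned OriginalOrderVotes) {
  auto I = std::max_element(
      OrderVotes.begin(), OrderVotes.end(),
      [](const auto &D1, const auto &D2) { return D1.second < D2.second; });
  // Reorder only when some permutation strictly outvotes the identity.
  if (I == OrderVotes.end() || I->second <= OriginalOrderVotes)
    return std::nullopt;
  return I->first;
}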
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -684,7 +652,8 @@ private:    /// \reorder commutative operands in alt shuffle if they result in    ///  vectorized code. -  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, +  void reorderAltShuffleOperands(const InstructionsState &S, +                                 ArrayRef<Value *> VL,                                   SmallVectorImpl<Value *> &Left,                                   SmallVectorImpl<Value *> &Right); @@ -698,8 +667,12 @@ private:      /// \returns true if the scalars in VL are equal to this entry.      bool isSame(ArrayRef<Value *> VL) const { -      assert(VL.size() == Scalars.size() && "Invalid size"); -      return std::equal(VL.begin(), VL.end(), Scalars.begin()); +      if (VL.size() == Scalars.size()) +        return std::equal(VL.begin(), VL.end(), Scalars.begin()); +      return VL.size() == ReuseShuffleIndices.size() && +             std::equal( +                 VL.begin(), VL.end(), ReuseShuffleIndices.begin(), +                 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });      }      /// A vector of scalars. @@ -711,6 +684,12 @@ private:      /// Do we need to gather this sequence ?      bool NeedToGather = false; +    /// Does this sequence require some shuffling? +    SmallVector<unsigned, 4> ReuseShuffleIndices; + +    /// Does this entry require reordering? +    ArrayRef<unsigned> ReorderIndices; +      /// Points back to the VectorizableTree.      ///      /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has @@ -725,13 +704,17 @@ private:    };    /// Create a new VectorizableTree entry. -  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, -                          int &UserTreeIdx) { +  void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, +                    ArrayRef<unsigned> ReuseShuffleIndices = None, +                    ArrayRef<unsigned> ReorderIndices = None) {      VectorizableTree.emplace_back(VectorizableTree);      int idx = VectorizableTree.size() - 1;      TreeEntry *Last = &VectorizableTree[idx];      Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());      Last->NeedToGather = !Vectorized; +    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), +                                     ReuseShuffleIndices.end()); +    Last->ReorderIndices = ReorderIndices;      if (Vectorized) {        for (int i = 0, e = VL.size(); i != e; ++i) {          assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -744,7 +727,6 @@ private:      if (UserTreeIdx >= 0)        Last->UserTreeIndices.push_back(UserTreeIdx);      UserTreeIdx = idx; -    return Last;    }    /// -- Vectorization State -- @@ -758,13 +740,6 @@ private:      return nullptr;    } -  const TreeEntry *getTreeEntry(Value *V) const { -    auto I = ScalarToTreeEntry.find(V); -    if (I != ScalarToTreeEntry.end()) -      return &VectorizableTree[I->second]; -    return nullptr; -  } -    /// Maps a specific scalar to its tree entry.    
SmallDenseMap<Value*, int> ScalarToTreeEntry; @@ -1038,7 +1013,7 @@ private:      template <typename ReadyListType>      void schedule(ScheduleData *SD, ReadyListType &ReadyList) {        SD->IsScheduled = true; -      DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n"); +      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");        ScheduleData *BundleMember = SD;        while (BundleMember) { @@ -1061,8 +1036,8 @@ private:                assert(!DepBundle->IsScheduled &&                       "already scheduled bundle gets ready");                ReadyList.insert(DepBundle); -              DEBUG(dbgs() -                    << "SLP:    gets ready (def): " << *DepBundle << "\n"); +              LLVM_DEBUG(dbgs() +                         << "SLP:    gets ready (def): " << *DepBundle << "\n");              }            });          } @@ -1075,8 +1050,8 @@ private:              assert(!DepBundle->IsScheduled &&                     "already scheduled bundle gets ready");              ReadyList.insert(DepBundle); -            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle -                         << "\n"); +            LLVM_DEBUG(dbgs() +                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");            }          }          BundleMember = BundleMember->NextInBundle; @@ -1101,7 +1076,8 @@ private:          doForAllOpcodes(I, [&](ScheduleData *SD) {            if (SD->isSchedulingEntity() && SD->isReady()) {              ReadyList.insert(SD); -            DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n"); +            LLVM_DEBUG(dbgs() +                       << "SLP:    initially in ready list: " << *I << "\n");            }          });        } @@ -1110,7 +1086,8 @@ private:      /// Checks if a bundle of instructions can be scheduled, i.e. has no      /// cyclic dependencies. This is only a dry-run, no instructions are      /// actually moved at this stage. -    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue); +    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, +                           const InstructionsState &S);      /// Un-bundles a group of instructions.      void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1120,7 +1097,7 @@ private:      /// Extends the scheduling region so that V is inside the region.      /// \returns true if the region size is within the limit. -    bool extendSchedulingRegion(Value *V, Value *OpValue); +    bool extendSchedulingRegion(Value *V, const InstructionsState &S);      /// Initialize the ScheduleData structures for new instructions in the      /// scheduling region. @@ -1201,11 +1178,38 @@ private:    /// List of users to ignore during scheduling and that don't need extracting.    ArrayRef<Value *> UserIgnoreList; -  // Number of load bundles that contain consecutive loads. -  int NumLoadsWantToKeepOrder = 0; +  using OrdersType = SmallVector<unsigned, 4>; +  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of +  /// sorted SmallVectors of unsigned. 
+  struct OrdersTypeDenseMapInfo { +    static OrdersType getEmptyKey() { +      OrdersType V; +      V.push_back(~1U); +      return V; +    } + +    static OrdersType getTombstoneKey() { +      OrdersType V; +      V.push_back(~2U); +      return V; +    } + +    static unsigned getHashValue(const OrdersType &V) { +      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); +    } + +    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { +      return LHS == RHS; +    } +  }; -  // Number of load bundles that contain consecutive loads in reversed order. -  int NumLoadsWantToChangeOrder = 0; +  /// Contains orders of operations along with the number of bundles that have +  /// operations in this order. It stores only those orders that require +  /// reordering, if reordering is not required it is counted using \a +  /// NumOpsWantToKeepOriginalOrder. +  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; +  /// Number of bundles that do not require reordering. +  unsigned NumOpsWantToKeepOriginalOrder = 0;    // Analysis and block reference.    Function *F; @@ -1242,7 +1246,7 @@ template <> struct GraphTraits<BoUpSLP *> {    /// NodeRef has to be a pointer per the GraphWriter.    using NodeRef = TreeEntry *; -  /// \brief Add the VectorizableTree to the index iterator to be able to return +  /// Add the VectorizableTree to the index iterator to be able to return    /// TreeEntry pointers.    struct ChildIteratorType        : public iterator_adaptor_base<ChildIteratorType, @@ -1340,17 +1344,22 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,      // For each lane:      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {        Value *Scalar = Entry->Scalars[Lane]; +      int FoundLane = Lane; +      if (!Entry->ReuseShuffleIndices.empty()) { +        FoundLane = +            std::distance(Entry->ReuseShuffleIndices.begin(), +                          llvm::find(Entry->ReuseShuffleIndices, FoundLane)); +      }        // Check if the scalar is externally used as an extra arg.        auto ExtI = ExternallyUsedValues.find(Scalar);        if (ExtI != ExternallyUsedValues.end()) { -        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << -              Lane << " from " << *Scalar << ".\n"); -        ExternalUses.emplace_back(Scalar, nullptr, Lane); -        continue; +        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " +                          << Lane << " from " << *Scalar << ".\n"); +        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);        }        for (User *U : Scalar->users()) { -        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); +        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");          Instruction *UserInst = dyn_cast<Instruction>(U);          if (!UserInst) @@ -1364,8 +1373,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,            // be used.            
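The FoundLane remapping above translates an external user's lane: after deduplication, Entry->Scalars holds only unique values, but extracts address the final reuse-shuffled vector, so the lane becomes the first position of the reuse mask that carries this unique value. A sketch (foundLane is hypothetical):

#include <algorithm>
#include <iterator>
#include <vector>

static int foundLane(const std::vector<unsigned> &ReuseMask, unsigned Lane) {
  if (ReuseMask.empty())
    return (int)Lane; // no duplicates: lanes already line up
  // First lane of the shuffled vector holding unique scalar number Lane.
  return (int)std::distance(
      ReuseMask.begin(), std::find(ReuseMask.begin(), ReuseMask.end(), Lane));
}
// ReuseMask {0, 1, 0, 1}: unique scalar 1 is extracted from shuffled lane 1.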
if (UseScalar != U ||                !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { -            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U -                         << ".\n"); +            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U +                              << ".\n");              assert(!UseEntry->NeedToGather && "Bad state");              continue;            } @@ -1375,9 +1384,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,          if (is_contained(UserIgnoreList, UserInst))            continue; -        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << -              Lane << " from " << *Scalar << ".\n"); -        ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); +        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " +                          << Lane << " from " << *Scalar << ".\n"); +        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));        }      }    } @@ -1389,28 +1398,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    InstructionsState S = getSameOpcode(VL);    if (Depth == RecursionMaxDepth) { -    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); +    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");      newTreeEntry(VL, false, UserTreeIdx);      return;    }    // Don't handle vectors.    if (S.OpValue->getType()->isVectorTy()) { -    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); +    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");      newTreeEntry(VL, false, UserTreeIdx);      return;    }    if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))      if (SI->getValueOperand()->getType()->isVectorTy()) { -      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); +      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");        newTreeEntry(VL, false, UserTreeIdx);        return;      }    // If all of the operands are identical or constant we have a simple solution. -  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { -    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); +  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { +    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");      newTreeEntry(VL, false, UserTreeIdx);      return;    } @@ -1421,8 +1430,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    // Don't vectorize ephemeral values.    for (unsigned i = 0, e = VL.size(); i != e; ++i) {      if (EphValues.count(VL[i])) { -      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << -            ") is ephemeral.\n"); +      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] +                        << ") is ephemeral.\n");        newTreeEntry(VL, false, UserTreeIdx);        return;      } @@ -1430,18 +1439,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    // Check if this is a duplicate of another entry.    
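The duplicate-entry test below was relaxed from element-wise equality to TreeEntry::isSame(), which also matches a bundle that equals an existing entry's scalars viewed through that entry's reuse mask. A sketch over plain values (isSameThroughMask is hypothetical):

#include <vector>

// VL matches either the entry's scalars directly, or the scalars viewed
// through the entry's reuse mask (VL[I] == Scalars[Mask[I]]).
template <typename T>
static bool isSameThroughMask(const std::vector<T> &VL,
                              const std::vector<T> &Scalars,
                              const std::vector<unsigned> &Mask) {
  if (VL == Scalars)
    return true;
  if (VL.size() != Mask.size())
    return false;
  for (size_t I = 0, E = VL.size(); I != E; ++I)
    if (VL[I] != Scalars[Mask[I]])
      return false;
  return true;
}
// Scalars {a, b}, Mask {0, 1, 0, 1}: VL {a, b, a, b} matches.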
if (TreeEntry *E = getTreeEntry(S.OpValue)) { -    for (unsigned i = 0, e = VL.size(); i != e; ++i) { -      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); -      if (E->Scalars[i] != VL[i]) { -        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); -        newTreeEntry(VL, false, UserTreeIdx); -        return; -      } +    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); +    if (!E->isSame(VL)) { +      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); +      newTreeEntry(VL, false, UserTreeIdx); +      return;      }      // Record the reuse of the tree node.  FIXME, currently this is only used to      // properly draw the graph rather than for the actual vectorization.      E->UserTreeIndices.push_back(UserTreeIdx); -    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); +    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue +                      << ".\n");      return;    } @@ -1451,8 +1459,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      if (!I)        continue;      if (getTreeEntry(I)) { -      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << -            ") is already in tree.\n"); +      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] +                        << ") is already in tree.\n");        newTreeEntry(VL, false, UserTreeIdx);        return;      } @@ -1462,7 +1470,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    // we need to gather the scalars.    for (unsigned i = 0, e = VL.size(); i != e; ++i) {      if (MustGather.count(VL[i])) { -      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); +      LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");        newTreeEntry(VL, false, UserTreeIdx);        return;      } @@ -1476,19 +1484,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    if (!DT->isReachableFromEntry(BB)) {      // Don't go into unreachable blocks. They may contain instructions with      // dependency cycles which confuse the final scheduling. -    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); +    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");      newTreeEntry(VL, false, UserTreeIdx);      return;    }    // Check that every instruction appears once in this bundle. 
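The hunk below replaces the old quadratic reject-on-duplicates loop: duplicates are now folded away and recorded in ReuseShuffleIndicies, so {a, b, a, b} is vectorized as the unique pair {a, b} plus one shufflevector with mask {0, 1, 0, 1}. A sketch of the same try_emplace pattern (dedupWithMask is hypothetical):

#include <unordered_map>
#include <vector>

// Returns the reuse mask; Unique receives each value's first occurrence.
// Requires std::hash<T>. try_emplace inserts only if the key is new.
template <typename T>
static std::vector<unsigned> dedupWithMask(const std::vector<T> &VL,
                                           std::vector<T> &Unique) {
  std::vector<unsigned> Mask;
  std::unordered_map<T, unsigned> FirstPos;
  for (const T &V : VL) {
    auto Res = FirstPos.try_emplace(V, (unsigned)Unique.size());
    Mask.push_back(Res.first->second);
    if (Res.second)
      Unique.push_back(V);
  }
  return Mask;
}
// {a, b, a, b} -> Unique {a, b}, Mask {0, 1, 0, 1}; the vectorizer keeps the
// mask only if Unique has a power-of-2 size greater than 1.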
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) -    for (unsigned j = i + 1; j < e; ++j) -      if (VL[i] == VL[j]) { -        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); -        newTreeEntry(VL, false, UserTreeIdx); -        return; -      } +  SmallVector<unsigned, 4> ReuseShuffleIndicies; +  SmallVector<Value *, 4> UniqueValues; +  DenseMap<Value *, unsigned> UniquePositions; +  for (Value *V : VL) { +    auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); +    ReuseShuffleIndicies.emplace_back(Res.first->second); +    if (Res.second) +      UniqueValues.emplace_back(V); +  } +  if (UniqueValues.size() == VL.size()) { +    ReuseShuffleIndicies.clear(); +  } else { +    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); +    if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { +      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); +      newTreeEntry(VL, false, UserTreeIdx); +      return; +    } +    VL = UniqueValues; +  }    auto &BSRef = BlocksSchedules[BB];    if (!BSRef) @@ -1496,18 +1517,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,    BlockScheduling &BS = *BSRef.get(); -  if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { -    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); +  if (!BS.tryScheduleBundle(VL, this, S)) { +    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");      assert((!BS.getScheduleData(VL0) ||              !BS.getScheduleData(VL0)->isPartOfBundle()) &&             "tryScheduleBundle should cancelScheduling on failure"); -    newTreeEntry(VL, false, UserTreeIdx); +    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);      return;    } -  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); +  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); -  unsigned ShuffleOrOp = S.IsAltShuffle ? -                (unsigned) Instruction::ShuffleVector : S.Opcode; +  unsigned ShuffleOrOp = S.isAltShuffle() ? 
+                (unsigned) Instruction::ShuffleVector : S.getOpcode();    switch (ShuffleOrOp) {      case Instruction::PHI: {        PHINode *PH = dyn_cast<PHINode>(VL0); @@ -1518,15 +1539,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            TerminatorInst *Term = dyn_cast<TerminatorInst>(                cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));            if (Term) { -            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); +            LLVM_DEBUG( +                dbgs() +                << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx); +            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);              return;            }          } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {          ValueList Operands; @@ -1541,13 +1564,35 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      }      case Instruction::ExtractValue:      case Instruction::ExtractElement: { -      bool Reuse = canReuseExtract(VL, VL0); +      OrdersType CurrentOrder; +      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);        if (Reuse) { -        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); -      } else { -        BS.cancelScheduling(VL, VL0); +        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); +        ++NumOpsWantToKeepOriginalOrder; +        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, +                     ReuseShuffleIndicies); +        return;        } -      newTreeEntry(VL, Reuse, UserTreeIdx); +      if (!CurrentOrder.empty()) { +        LLVM_DEBUG({ +          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " +                    "with order"; +          for (unsigned Idx : CurrentOrder) +            dbgs() << " " << Idx; +          dbgs() << "\n"; +        }); +        // Insert new order with initial value 0, if it does not exist, +        // otherwise return the iterator to the existing one. 
+        auto StoredCurrentOrderAndNum = +            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; +        ++StoredCurrentOrderAndNum->getSecond(); +        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, +                     StoredCurrentOrderAndNum->getFirst()); +        return; +      } +      LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); +      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); +      BS.cancelScheduling(VL, VL0);        return;      }      case Instruction::Load: { @@ -1562,62 +1607,67 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        if (DL->getTypeSizeInBits(ScalarTy) !=            DL->getTypeAllocSizeInBits(ScalarTy)) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx); -        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); +        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");          return;        }        // Make sure all loads in the bundle are simple - we can't vectorize        // atomic or volatile loads. -      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { -        LoadInst *L = cast<LoadInst>(VL[i]); +      SmallVector<Value *, 4> PointerOps(VL.size()); +      auto POIter = PointerOps.begin(); +      for (Value *V : VL) { +        auto *L = cast<LoadInst>(V);          if (!L->isSimple()) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");            return;          } +        *POIter = L->getPointerOperand(); +        ++POIter;        } -      // Check if the loads are consecutive, reversed, or neither. -      // TODO: What we really want is to sort the loads, but for now, check -      // the two likely directions. -      bool Consecutive = true; -      bool ReverseConsecutive = true; -      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { -        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { -          Consecutive = false; -          break; +      OrdersType CurrentOrder; +      // Check the order of pointer operands. +      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { +        Value *Ptr0; +        Value *PtrN; +        if (CurrentOrder.empty()) { +          Ptr0 = PointerOps.front(); +          PtrN = PointerOps.back();          } else { -          ReverseConsecutive = false; +          Ptr0 = PointerOps[CurrentOrder.front()]; +          PtrN = PointerOps[CurrentOrder.back()];          } -      } - -      if (Consecutive) { -        ++NumLoadsWantToKeepOrder; -        newTreeEntry(VL, true, UserTreeIdx); -        DEBUG(dbgs() << "SLP: added a vector of loads.\n"); -        return; -      } - -      // If none of the load pairs were consecutive when checked in order, -      // check the reverse order. 
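The new load handling around this point sorts the pointer operands (llvm::sortPtrAccesses) and then, via SCEV, checks that the distance from the first to the last pointer is exactly (NumLoads - 1) * ElementSize; jumbled-but-consecutive bundles are then vectorized with a reorder mask instead of being gathered. An arithmetic illustration with raw addresses, valid because lanes are distinct after deduplication and their offsets are multiples of the element size:

#include <algorithm>
#include <cstdint>
#include <vector>

// With distinct lanes whose offsets are multiples of Size, the sorted
// addresses are consecutive iff last - first == (N - 1) * Size: there are
// N - 1 gaps, each at least Size, and they must sum to last - first.
static bool consecutiveAfterSort(std::vector<uint64_t> Addrs, uint64_t Size) {
  std::sort(Addrs.begin(), Addrs.end());
  return Addrs.back() - Addrs.front() == (Addrs.size() - 1) * Size;
}
// {0x108, 0x100, 0x110, 0x118} with Size 8 -> true: jumbled but consecutive,
// so the bundle is vectorized with a reorder mask instead of gathered.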
-      if (ReverseConsecutive) -        for (unsigned i = VL.size() - 1; i > 0; --i) -          if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) { -            ReverseConsecutive = false; -            break; +        const SCEV *Scev0 = SE->getSCEV(Ptr0); +        const SCEV *ScevN = SE->getSCEV(PtrN); +        const auto *Diff = +            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); +        uint64_t Size = DL->getTypeAllocSize(ScalarTy); +        // Check that the sorted loads are consecutive. +        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { +          if (CurrentOrder.empty()) { +            // Original loads are consecutive and does not require reordering. +            ++NumOpsWantToKeepOriginalOrder; +            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, +                         ReuseShuffleIndicies); +            LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); +          } else { +            // Need to reorder. +            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; +            ++I->getSecond(); +            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, +                         ReuseShuffleIndicies, I->getFirst()); +            LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");            } +          return; +        } +      } +      LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");        BS.cancelScheduling(VL, VL0); -      newTreeEntry(VL, false, UserTreeIdx); - -      if (ReverseConsecutive) { -        ++NumLoadsWantToChangeOrder; -        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); -      } else { -        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); -      } +      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);        return;      }      case Instruction::ZExt: @@ -1637,13 +1687,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();          if (Ty != SrcTy || !isValidElementType(Ty)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() +                     << "SLP: Gathering casts with different src types.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of casts.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {          ValueList Operands; @@ -1665,14 +1716,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          if (Cmp->getPredicate() != P0 ||              Cmp->getOperand(0)->getType() != ComparedTy) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() +                     << "SLP: Gathering cmp with different predicate.\n");            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of compares.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +  
    LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {          ValueList Operands; @@ -1703,14 +1755,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      case Instruction::And:      case Instruction::Or:      case Instruction::Xor: -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");        // Sort operands of the instructions so that each side is more likely to        // have the same opcode.        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {          ValueList Left, Right; -        reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right); +        reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx);          buildTree_rec(Right, Depth + 1, UserTreeIdx);          return; @@ -1730,9 +1782,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        // We don't combine GEPs with complicated (nested) indexing.        for (unsigned j = 0; j < VL.size(); ++j) {          if (cast<Instruction>(VL[j])->getNumOperands() != 2) { -          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); +          LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);            return;          }        } @@ -1743,9 +1795,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        for (unsigned j = 0; j < VL.size(); ++j) {          Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();          if (Ty0 != CurTy) { -          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); +          LLVM_DEBUG(dbgs() +                     << "SLP: not-vectorizable GEP (different types).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);            return;          }        } @@ -1754,16 +1807,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        for (unsigned j = 0; j < VL.size(); ++j) {          auto Op = cast<Instruction>(VL[j])->getOperand(1);          if (!isa<ConstantInt>(Op)) { -          DEBUG( -              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); +          LLVM_DEBUG(dbgs() +                     << "SLP: not-vectorizable GEP (non-constant indexes).\n");            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");        for (unsigned i = 0, e = 2; i < e; ++i) {          ValueList Operands;          // Prepare the operand vector. 
@@ -1779,13 +1832,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)          if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");            return;          } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a vector of stores.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");        ValueList Operands;        for (Value *j : VL) @@ -1802,8 +1855,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);        if (!isTriviallyVectorizable(ID)) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx); -        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); +        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +        LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");          return;        }        Function *Int = CI->getCalledFunction(); @@ -1816,9 +1869,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,              getVectorIntrinsicIDForCall(CI2, TLI) != ID ||              !CI->hasIdenticalOperandBundleSchema(*CI2)) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] -                       << "\n"); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] +                            << "\n");            return;          }          // ctlz,cttz and powi are special intrinsics whose second argument @@ -1827,10 +1880,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,            Value *A1J = CI2->getArgOperand(1);            if (A1I != A1J) {              BS.cancelScheduling(VL, VL0); -            newTreeEntry(VL, false, UserTreeIdx); -            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI -                         << " argument "<< A1I<<"!=" << A1J -                         << "\n"); +            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +            LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI +                              << " argument " << A1I << "!=" << A1J << "\n");              return;            }          } @@ -1840,14 +1892,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,                          CI->op_begin() + CI->getBundleOperandsEndIndex(),                          CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {            BS.cancelScheduling(VL, VL0); -          newTreeEntry(VL, false, UserTreeIdx); -          DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" -                       << *VL[i] << '\n'); +          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" +                            << *CI << "!=" << *VL[i] << '\n');            return;          }        } -      newTreeEntry(VL, true, UserTreeIdx); +      
newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {          ValueList Operands;          // Prepare the operand vector. @@ -1862,19 +1914,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      case Instruction::ShuffleVector:        // If this is not an alternate sequence of opcode like add-sub        // then do not vectorize this instruction. -      if (!S.IsAltShuffle) { +      if (!S.isAltShuffle()) {          BS.cancelScheduling(VL, VL0); -        newTreeEntry(VL, false, UserTreeIdx); -        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); +        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");          return;        } -      newTreeEntry(VL, true, UserTreeIdx); -      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); +      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");        // Reorder operands if reordering would enable vectorization.        if (isa<BinaryOperator>(VL0)) {          ValueList Left, Right; -        reorderAltShuffleOperands(S.Opcode, VL, Left, Right); +        reorderAltShuffleOperands(S, VL, Left, Right);          buildTree_rec(Left, Depth + 1, UserTreeIdx);          buildTree_rec(Right, Depth + 1, UserTreeIdx);          return; @@ -1892,8 +1944,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,      default:        BS.cancelScheduling(VL, VL0); -      newTreeEntry(VL, false, UserTreeIdx); -      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); +      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); +      LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");        return;    }  } @@ -1923,15 +1975,18 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {    return N;  } -bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const { +bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, +                              SmallVectorImpl<unsigned> &CurrentOrder) const {    Instruction *E0 = cast<Instruction>(OpValue);    assert(E0->getOpcode() == Instruction::ExtractElement ||           E0->getOpcode() == Instruction::ExtractValue); -  assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode"); +  assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");    // Check if all of the extracts come from the same vector and from the    // correct offset.    Value *Vec = E0->getOperand(0); +  CurrentOrder.clear(); +    // We have to extract from a vector/aggregate with the same number of elements.    unsigned NElts;    if (E0->getOpcode() == Instruction::ExtractValue) { @@ -1951,15 +2006,40 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {      return false;    // Check that all of the indices extract from the correct offset. -  for (unsigned I = 0, E = VL.size(); I < E; ++I) { -    Instruction *Inst = cast<Instruction>(VL[I]); -    if (!matchExtractIndex(Inst, I, Inst->getOpcode())) -      return false; +  bool ShouldKeepOrder = true; +  unsigned E = VL.size(); +  // Assign to all items the initial value E + 1 so we can check if the extract +  // instruction index was used already. 
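canReuseExtract below now computes CurrentOrder instead of demanding identity indices: the lanes' extract indices must form a permutation of 0..E-1 (the sentinel E + 1 marks an unused slot, catching duplicates); the identity permutation means direct reuse, and any other permutation costs one reorder shuffle. A compact sketch (extractOrder is hypothetical):

#include <optional>
#include <vector>

static std::optional<std::vector<unsigned>>
extractOrder(const std::vector<unsigned> &ExtIdx) {
  unsigned E = (unsigned)ExtIdx.size();
  std::vector<unsigned> Order(E, E + 1); // E + 1 marks an unused slot
  for (unsigned I = 0; I != E; ++I) {
    unsigned Idx = ExtIdx[I];
    if (Idx >= E || Order[Idx] != E + 1)
      return std::nullopt; // out-of-range or repeated extract index
    Order[Idx] = I;        // vector element Idx is produced by lane I
  }
  return Order; // identity (Order[I] == I for all I) means plain reuse
}
// {1, 0, 3, 2} -> usable with a single reorder shuffle; {0, 0, 2, 3} -> fail.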
+  // Also, later we can check that all the indices are used and we have a +  // consecutive access in the extract instructions, by checking that no +  // element of CurrentOrder still has value E + 1. +  CurrentOrder.assign(E, E + 1); +  unsigned I = 0; +  for (; I < E; ++I) { +    auto *Inst = cast<Instruction>(VL[I]);      if (Inst->getOperand(0) != Vec) -      return false; +      break; +    Optional<unsigned> Idx = getExtractIndex(Inst); +    if (!Idx) +      break; +    const unsigned ExtIdx = *Idx; +    if (ExtIdx != I) { +      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) +        break; +      ShouldKeepOrder = false; +      CurrentOrder[ExtIdx] = I; +    } else { +      if (CurrentOrder[I] != E + 1) +        break; +      CurrentOrder[I] = I; +    } +  } +  if (I < E) { +    CurrentOrder.clear(); +    return false;    } -  return true; +  return ShouldKeepOrder;  }  bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { @@ -1985,13 +2065,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {      VecTy = VectorType::get(          IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); +  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); +  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); +  int ReuseShuffleCost = 0; +  if (NeedToShuffleReuses) { +    ReuseShuffleCost = +        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); +  }    if (E->NeedToGather) {      if (allConstant(VL))        return 0;      if (isSplat(VL)) { -      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); +      return ReuseShuffleCost + +             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);      } -    if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { +    if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && +        allSameType(VL) && allSameBlock(VL)) {        Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);        if (ShuffleKind.hasValue()) {          int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); @@ -2008,37 +2097,86 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {                                              IO->getZExtValue());            }          } -        return Cost; +        return ReuseShuffleCost + Cost;        }      } -    return getGatherCost(E->Scalars); +    return ReuseShuffleCost + getGatherCost(VL);    }    InstructionsState S = getSameOpcode(VL); -  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); +  assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");    Instruction *VL0 = cast<Instruction>(S.OpValue); -  unsigned ShuffleOrOp = S.IsAltShuffle ? -               (unsigned) Instruction::ShuffleVector : S.Opcode; +  unsigned ShuffleOrOp = S.isAltShuffle() ? 
+               (unsigned) Instruction::ShuffleVector : S.getOpcode();    switch (ShuffleOrOp) {      case Instruction::PHI:        return 0;      case Instruction::ExtractValue:      case Instruction::ExtractElement: -      if (canReuseExtract(VL, S.OpValue)) { -        int DeadCost = 0; +      if (NeedToShuffleReuses) { +        unsigned Idx = 0; +        for (unsigned I : E->ReuseShuffleIndices) { +          if (ShuffleOrOp == Instruction::ExtractElement) { +            auto *IO = cast<ConstantInt>( +                cast<ExtractElementInst>(VL[I])->getIndexOperand()); +            Idx = IO->getZExtValue(); +            ReuseShuffleCost -= TTI->getVectorInstrCost( +                Instruction::ExtractElement, VecTy, Idx); +          } else { +            ReuseShuffleCost -= TTI->getVectorInstrCost( +                Instruction::ExtractElement, VecTy, Idx); +            ++Idx; +          } +        } +        Idx = ReuseShuffleNumbers; +        for (Value *V : VL) { +          if (ShuffleOrOp == Instruction::ExtractElement) { +            auto *IO = cast<ConstantInt>( +                cast<ExtractElementInst>(V)->getIndexOperand()); +            Idx = IO->getZExtValue(); +          } else { +            --Idx; +          } +          ReuseShuffleCost += +              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); +        } +      } +      if (!E->NeedToGather) { +        int DeadCost = ReuseShuffleCost; +        if (!E->ReorderIndices.empty()) { +          // TODO: Merge this shuffle with the ReuseShuffleCost. +          DeadCost += TTI->getShuffleCost( +              TargetTransformInfo::SK_PermuteSingleSrc, VecTy); +        }          for (unsigned i = 0, e = VL.size(); i < e; ++i) {            Instruction *E = cast<Instruction>(VL[i]);            // If all users are going to be vectorized, instruction can be            // considered as dead.            // The same, if have only one user, it will be vectorized for sure. -          if (areAllUsersVectorized(E)) +          if (areAllUsersVectorized(E)) {              // Take credit for instruction that will become dead. -            DeadCost += +            if (E->hasOneUse()) { +              Instruction *Ext = E->user_back(); +              if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && +                  all_of(Ext->users(), +                         [](User *U) { return isa<GetElementPtrInst>(U); })) { +                // Use getExtractWithExtendCost() to calculate the cost of +                // extractelement/ext pair. +                DeadCost -= TTI->getExtractWithExtendCost( +                    Ext->getOpcode(), Ext->getType(), VecTy, i); +                // Add back the cost of s|zext which is subtracted seperately. 
+                DeadCost += TTI->getCastInstrCost( +                    Ext->getOpcode(), Ext->getType(), E->getType(), Ext); +                continue; +              } +            } +            DeadCost -=                  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); +          }          } -        return -DeadCost; +        return DeadCost;        } -      return getGatherCost(VecTy); +      return ReuseShuffleCost + getGatherCost(VL);      case Instruction::ZExt:      case Instruction::SExt: @@ -2053,24 +2191,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {      case Instruction::FPTrunc:      case Instruction::BitCast: {        Type *SrcTy = VL0->getOperand(0)->getType(); +      int ScalarEltCost = +          TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      }        // Calculate the cost of this instruction. -      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), -                                                         VL0->getType(), SrcTy, VL0); +      int ScalarCost = VL.size() * ScalarEltCost;        VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); -      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); +      int VecCost = 0; +      // Check if the values are candidates to demote. +      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { +        VecCost = ReuseShuffleCost + +                  TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); +      }        return VecCost - ScalarCost;      }      case Instruction::FCmp:      case Instruction::ICmp:      case Instruction::Select: {        // Calculate the cost of this instruction. +      int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, +                                                  Builder.getInt1Ty(), VL0); +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      }        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); -      int ScalarCost = VecTy->getNumElements() * -          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); -      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); -      return VecCost - ScalarCost; +      int ScalarCost = VecTy->getNumElements() * ScalarEltCost; +      int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); +      return ReuseShuffleCost + VecCost - ScalarCost;      }      case Instruction::Add:      case Instruction::FAdd: @@ -2099,42 +2250,43 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {        TargetTransformInfo::OperandValueProperties Op1VP =            TargetTransformInfo::OP_None;        TargetTransformInfo::OperandValueProperties Op2VP = -          TargetTransformInfo::OP_None; +          TargetTransformInfo::OP_PowerOf2;        // If all operands are exactly the same ConstantInt then set the        // operand kind to OK_UniformConstantValue.        // If instead not all operands are constants, then set the operand kind        // to OK_AnyValue. If all operands are constants but not the same,        // then set the operand kind to OK_NonUniformConstantValue. 
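      // For illustration only (an editorial aside, not part of this patch),
      // the updated operand-kind logic classifies second operands like so:
      //   {x >> 2, y >> 2, z >> 2} -> OK_UniformConstantValue,    OP_PowerOf2
      //   {x >> 2, y >> 8, z >> 4} -> OK_NonUniformConstantValue, OP_PowerOf2
      //   {x >> 2, y >> 3, z >> 4} -> OK_NonUniformConstantValue, OP_None
      //   {x >> 2, y >> n, z >> 4} -> OK_AnyValue,                OP_None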
-      ConstantInt *CInt = nullptr; -      for (unsigned i = 0; i < VL.size(); ++i) { +      ConstantInt *CInt0 = nullptr; +      for (unsigned i = 0, e = VL.size(); i < e; ++i) {          const Instruction *I = cast<Instruction>(VL[i]); -        if (!isa<ConstantInt>(I->getOperand(1))) { +        ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); +        if (!CInt) {            Op2VK = TargetTransformInfo::OK_AnyValue; +          Op2VP = TargetTransformInfo::OP_None;            break;          } +        if (Op2VP == TargetTransformInfo::OP_PowerOf2 && +            !CInt->getValue().isPowerOf2()) +          Op2VP = TargetTransformInfo::OP_None;          if (i == 0) { -          CInt = cast<ConstantInt>(I->getOperand(1)); +          CInt0 = CInt;            continue;          } -        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && -            CInt != cast<ConstantInt>(I->getOperand(1))) +        if (CInt0 != CInt)            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;        } -      // FIXME: Currently cost of model modification for division by power of -      // 2 is handled for X86 and AArch64. Add support for other targets. -      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && -          CInt->getValue().isPowerOf2()) -        Op2VP = TargetTransformInfo::OP_PowerOf2;        SmallVector<const Value *, 4> Operands(VL0->operand_values()); -      int ScalarCost = -          VecTy->getNumElements() * -          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, -                                      Op2VP, Operands); -      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, -                                                Op1VP, Op2VP, Operands); -      return VecCost - ScalarCost; +      int ScalarEltCost = TTI->getArithmeticInstrCost( +          S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      } +      int ScalarCost = VecTy->getNumElements() * ScalarEltCost; +      int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, +                                                Op2VK, Op1VP, Op2VP, Operands); +      return ReuseShuffleCost + VecCost - ScalarCost;      }      case Instruction::GetElementPtr: {        TargetTransformInfo::OperandValueKind Op1VK = @@ -2142,83 +2294,119 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {        TargetTransformInfo::OperandValueKind Op2VK =            TargetTransformInfo::OK_UniformConstantValue; -      int ScalarCost = -          VecTy->getNumElements() * +      int ScalarEltCost =            TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      } +      int ScalarCost = VecTy->getNumElements() * ScalarEltCost;        int VecCost =            TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); - -      return VecCost - ScalarCost; +      return ReuseShuffleCost + VecCost - ScalarCost;      }      case Instruction::Load: {        // Cost of wide load - cost of scalar loads. 
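      // A rough worked example (editorial sketch, not from this patch; the
      // unit costs are assumed TTI values): for a bundle of four i32 loads
      // with scalar load cost 1 each and a <4 x i32> load cost of 1,
      //   VecLdCost - ScalarLdCost = 1 - 4 = -3,
      // i.e. profitable; any reorder or reuse shuffle cost is then added on
      // top of that difference, as the code below shows.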
-      unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment(); -      int ScalarLdCost = VecTy->getNumElements() * +      unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); +      int ScalarEltCost =            TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); -      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, -                                           VecTy, alignment, 0, VL0); -      return VecLdCost - ScalarLdCost; +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      } +      int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; +      int VecLdCost = +          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); +      if (!E->ReorderIndices.empty()) { +        // TODO: Merge this shuffle with the ReuseShuffleCost. +        VecLdCost += TTI->getShuffleCost( +            TargetTransformInfo::SK_PermuteSingleSrc, VecTy); +      } +      return ReuseShuffleCost + VecLdCost - ScalarLdCost;      }      case Instruction::Store: {        // We know that we can merge the stores. Calculate the cost. -      unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment(); -      int ScalarStCost = VecTy->getNumElements() * +      unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); +      int ScalarEltCost =            TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); -      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, -                                           VecTy, alignment, 0, VL0); -      return VecStCost - ScalarStCost; +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      } +      int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; +      int VecStCost = +          TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); +      return ReuseShuffleCost + VecStCost - ScalarStCost;      }      case Instruction::Call: {        CallInst *CI = cast<CallInst>(VL0);        Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);        // Calculate the cost of the scalar and vector calls. 
-      SmallVector<Type*, 4> ScalarTys; -      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) +      SmallVector<Type *, 4> ScalarTys; +      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)          ScalarTys.push_back(CI->getArgOperand(op)->getType());        FastMathFlags FMF;        if (auto *FPMO = dyn_cast<FPMathOperator>(CI))          FMF = FPMO->getFastMathFlags(); -      int ScalarCallCost = VecTy->getNumElements() * +      int ScalarEltCost =            TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); +      if (NeedToShuffleReuses) { +        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; +      } +      int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;        SmallVector<Value *, 4> Args(CI->arg_operands());        int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,                                                     VecTy->getNumElements()); -      DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost -            << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")" -            << " for " << *CI << "\n"); +      LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost +                        << " (" << VecCallCost << "-" << ScalarCallCost << ")" +                        << " for " << *CI << "\n"); -      return VecCallCost - ScalarCallCost; +      return ReuseShuffleCost + VecCallCost - ScalarCallCost;      }      case Instruction::ShuffleVector: { -      TargetTransformInfo::OperandValueKind Op1VK = -          TargetTransformInfo::OK_AnyValue; -      TargetTransformInfo::OperandValueKind Op2VK = -          TargetTransformInfo::OK_AnyValue; +      assert(S.isAltShuffle() && +             ((Instruction::isBinaryOp(S.getOpcode()) && +               Instruction::isBinaryOp(S.getAltOpcode())) || +              (Instruction::isCast(S.getOpcode()) && +               Instruction::isCast(S.getAltOpcode()))) && +             "Invalid Shuffle Vector Operand");        int ScalarCost = 0; -      int VecCost = 0; +      if (NeedToShuffleReuses) { +        for (unsigned Idx : E->ReuseShuffleIndices) { +          Instruction *I = cast<Instruction>(VL[Idx]); +          ReuseShuffleCost -= TTI->getInstructionCost( +              I, TargetTransformInfo::TCK_RecipThroughput); +        } +        for (Value *V : VL) { +          Instruction *I = cast<Instruction>(V); +          ReuseShuffleCost += TTI->getInstructionCost( +              I, TargetTransformInfo::TCK_RecipThroughput); +        } +      }        for (Value *i : VL) {          Instruction *I = cast<Instruction>(i); -        if (!I) -          break; -        ScalarCost += -            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); +        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); +        ScalarCost += TTI->getInstructionCost( +            I, TargetTransformInfo::TCK_RecipThroughput);        }        // VecCost is equal to sum of the cost of creating 2 vectors        // and the cost of creating shuffle. 
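      // Illustration, not from this patch: for an alternating
      // {fadd, fsub, fadd, fsub} bundle over <4 x float>, the code below
      // computes
      //   VecCost = C(fadd <4 x float>) + C(fsub <4 x float>) + C(SK_Select),
      // and for an alternating pair of casts it uses getCastInstrCost for the
      // main and alternate opcodes instead of getArithmeticInstrCost.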
-      Instruction *I0 = cast<Instruction>(VL[0]);
-      VecCost =
-          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
-      Instruction *I1 = cast<Instruction>(VL[1]);
-      VecCost +=
-          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
-      VecCost +=
-          TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
-      return VecCost - ScalarCost;
+      int VecCost = 0;
+      if (Instruction::isBinaryOp(S.getOpcode())) {
+        VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy);
+        VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy);
+      } else {
+        Type *Src0SclTy = S.MainOp->getOperand(0)->getType();
+        Type *Src1SclTy = S.AltOp->getOperand(0)->getType();
+        VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
+        VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
+        VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty);
+        VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty);
+      }
+      VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+      return ReuseShuffleCost + VecCost - ScalarCost;
     }
     default:
       llvm_unreachable("Unknown instruction");
@@ -2226,8 +2414,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
 }

 bool BoUpSLP::isFullyVectorizableTinyTree() {
-  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
-        VectorizableTree.size() << " is fully vectorizable .\n");
+  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+                    << VectorizableTree.size() << " is fully vectorizable .\n");

   // We only handle trees of heights 1 and 2.
   if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
@@ -2297,13 +2485,13 @@ int BoUpSLP::getSpillCost() {
         LiveValues.insert(cast<Instruction>(&*J));
     }

-    DEBUG(
+    LLVM_DEBUG({
       dbgs() << "SLP: #LV: " << LiveValues.size();
       for (auto *X : LiveValues)
         dbgs() << " " << X->getName();
       dbgs() << ", Looking at ";
       Inst->dump();
-      );
+    });

     // Now find the sequence of instructions between PrevInst and Inst.
     BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -2315,7 +2503,10 @@ int BoUpSLP::getSpillCost() {
         continue;
       }

-      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+      // Debug information doesn't impact spill cost.
+      if ((isa<CallInst>(&*PrevInstIt) &&
+           !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+          &*PrevInstIt != PrevInst) {
         SmallVector<Type*, 4> V;
         for (auto *II : LiveValues)
           V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -2333,19 +2524,41 @@ int BoUpSLP::getTreeCost() {
   int Cost = 0;
-  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
-        VectorizableTree.size() << ".\n");
+  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+                    << VectorizableTree.size() << ".\n");

   unsigned BundleWidth = VectorizableTree[0].Scalars.size();

-  for (TreeEntry &TE : VectorizableTree) {
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry &TE = VectorizableTree[I];
+
+    // We create duplicate tree entries for gather sequences that have multiple
+    // uses. However, we should not compute the cost of duplicate sequences.
+    // For example, if we have a build vector (i.e., insertelement sequence)
+    // that is used by more than one vector instruction, we only need to
+    // compute the cost of the insertelement instructions once. The redundant
+    // instructions will be eliminated by CSE.
+    //
+    // We should consider not creating duplicate tree entries for gather
+    // sequences, and instead add additional edges to the tree representing
+    // their uses. Since such an approach results in fewer total entries,
+    // existing heuristics based on tree size may yield different results.
+    //
+    if (TE.NeedToGather &&
+        std::any_of(std::next(VectorizableTree.begin(), I + 1),
+                    VectorizableTree.end(), [TE](TreeEntry &Entry) {
+                      return Entry.NeedToGather && Entry.isSame(TE.Scalars);
+                    }))
+      continue;
+
     int C = getEntryCost(&TE);
-    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
-                 << *TE.Scalars[0] << ".\n");
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                      << " for bundle that starts with " << *TE.Scalars[0]
+                      << ".\n");
     Cost += C;
   }

-  SmallSet<Value *, 16> ExtractCostCalculated;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
   int ExtractCost = 0;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
@@ -2386,7 +2599,7 @@ int BoUpSLP::getTreeCost() {
        << "SLP: Extract Cost = " << ExtractCost << ".\n"
        << "SLP: Total Cost = " << Cost << ".\n";
   }
-  DEBUG(dbgs() << Str);
+  LLVM_DEBUG(dbgs() << Str);

   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
@@ -2394,10 +2607,14 @@ int BoUpSLP::getTreeCost() {
   return Cost;
 }

-int BoUpSLP::getGatherCost(Type *Ty) {
+int BoUpSLP::getGatherCost(Type *Ty,
+                           const DenseSet<unsigned> &ShuffledIndices) {
   int Cost = 0;
   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (!ShuffledIndices.count(i))
+      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  if (!ShuffledIndices.empty())
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   return Cost;
 }

@@ -2408,7 +2625,17 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
   // Find the cost of inserting/extracting values from the vector.
-  return getGatherCost(VecTy);
+  // Check if the same elements are inserted several times and count them as
+  // shuffle candidates.
+  DenseSet<unsigned> ShuffledElements;
+  DenseSet<Value *> UniqueElements;
+  // Iterate in reverse order to consider insert elements with the high cost.
+  for (unsigned I = VL.size(); I > 0; --I) {
+    unsigned Idx = I - 1;
+    if (!UniqueElements.insert(VL[Idx]).second)
+      ShuffledElements.insert(Idx);
+  }
+  return getGatherCost(VecTy, ShuffledElements);
 }

 // Reorder commutative operations in alternate shuffle if the resulting vectors
@@ -2420,16 +2647,14 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
 // load a[3] + load b[3]
 // Reordering the second load b[1]  load a[1] would allow us to vectorize this
 // code.
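// That reordering amounts to a lane swap like the following standalone
// sketch (editorial, not from this patch; isConsecutive is a hypothetical
// stand-in for isConsecutiveAccess with the DataLayout and ScalarEvolution
// arguments bound):
//
//   for (unsigned j = 0; j + 1 < Left.size(); ++j)
//     if (auto *L = dyn_cast<LoadInst>(Left[j]))
//       if (auto *L1 = dyn_cast<LoadInst>(Right[j + 1]))
//         if (isConsecutive(L, L1))
//           std::swap(Left[j + 1], Right[j + 1]);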
-void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL, +void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, +                                        ArrayRef<Value *> VL,                                          SmallVectorImpl<Value *> &Left,                                          SmallVectorImpl<Value *> &Right) {    // Push left and right operands of binary operation into Left and Right -  unsigned AltOpcode = getAltOpcode(Opcode); -  (void)AltOpcode;    for (Value *V : VL) {      auto *I = cast<Instruction>(V); -    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && -           "Incorrect instruction in vector"); +    assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector");      Left.push_back(I->getOperand(0));      Right.push_back(I->getOperand(1));    } @@ -2609,7 +2834,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,    // add a[1],c[2]  load b[1]    // b[2]           load b[2]    // add a[3],c[3]  load b[3] -  for (unsigned j = 0; j < VL.size() - 1; ++j) { +  for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) {      if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {        if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {          if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2630,17 +2855,15 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,    }  } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, +                                        const InstructionsState &S) {    // Get the basic block this bundle is in. All instructions in the bundle    // should be in this block. -  auto *Front = cast<Instruction>(OpValue); +  auto *Front = cast<Instruction>(S.OpValue);    auto *BB = Front->getParent(); -  const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode(); -  const unsigned AltOpcode = getAltOpcode(Opcode);    assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { -    return !sameOpcodeOrAlt(Opcode, AltOpcode, -                            cast<Instruction>(V)->getOpcode()) || -           cast<Instruction>(V)->getParent() == BB; +    auto *I = cast<Instruction>(V); +    return !S.isOpcodeOrAlt(I) || I->getParent() == BB;    }));    // The last instruction in the bundle in program order. @@ -2652,7 +2875,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {    // bundle. The end of the bundle is marked by null ScheduleData.    if (BlocksSchedules.count(BB)) {      auto *Bundle = -        BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back())); +        BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));      if (Bundle && Bundle->isPartOfBundle())        for (; Bundle; Bundle = Bundle->NextInBundle)          if (Bundle->OpValue == Bundle->Inst) @@ -2680,7 +2903,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {    if (!LastInst) {      SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());      for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { -      if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode())) +      if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I))          LastInst = &I;        if (Bundle.empty())          break; @@ -2706,7 +2929,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {        if (TreeEntry *E = getTreeEntry(VL[i])) {          // Find which lane we need to extract.          
int FoundLane = -1; -        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { +        for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {            // Is this the lane of the scalar that we are looking for ?            if (E->Scalars[Lane] == VL[i]) {              FoundLane = Lane; @@ -2714,6 +2937,11 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {            }          }          assert(FoundLane >= 0 && "Could not find the correct lane"); +        if (!E->ReuseShuffleIndices.empty()) { +          FoundLane = +              std::distance(E->ReuseShuffleIndices.begin(), +                            llvm::find(E->ReuseShuffleIndices, FoundLane)); +        }          ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));        }      } @@ -2722,66 +2950,128 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {    return Vec;  } -Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const { -  if (const TreeEntry *En = getTreeEntry(OpValue)) { -    if (En->isSame(VL) && En->VectorizedValue) -      return En->VectorizedValue; -  } -  return nullptr; -} -  Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {    InstructionsState S = getSameOpcode(VL); -  if (S.Opcode) { +  if (S.getOpcode()) {      if (TreeEntry *E = getTreeEntry(S.OpValue)) { -      if (E->isSame(VL)) -        return vectorizeTree(E); +      if (E->isSame(VL)) { +        Value *V = vectorizeTree(E); +        if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { +          // We need to get the vectorized value but without shuffle. +          if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) { +            V = SV->getOperand(0); +          } else { +            // Reshuffle to get only unique values. +            SmallVector<unsigned, 4> UniqueIdxs; +            SmallSet<unsigned, 4> UsedIdxs; +            for(unsigned Idx : E->ReuseShuffleIndices) +              if (UsedIdxs.insert(Idx).second) +                UniqueIdxs.emplace_back(Idx); +            V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), +                                            UniqueIdxs); +          } +        } +        return V; +      }      }    }    Type *ScalarTy = S.OpValue->getType();    if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))      ScalarTy = SI->getValueOperand()->getType(); + +  // Check that every instruction appears once in this bundle. +  SmallVector<unsigned, 4> ReuseShuffleIndicies; +  SmallVector<Value *, 4> UniqueValues; +  if (VL.size() > 2) { +    DenseMap<Value *, unsigned> UniquePositions; +    for (Value *V : VL) { +      auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); +      ReuseShuffleIndicies.emplace_back(Res.first->second); +      if (Res.second || isa<Constant>(V)) +        UniqueValues.emplace_back(V); +    } +    // Do not shuffle single element or if number of unique values is not power +    // of 2. 
+    if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || +        !llvm::isPowerOf2_32(UniqueValues.size())) +      ReuseShuffleIndicies.clear(); +    else +      VL = UniqueValues; +  }    VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); -  return Gather(VL, VecTy); +  Value *V = Gather(VL, VecTy); +  if (!ReuseShuffleIndicies.empty()) { +    V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                    ReuseShuffleIndicies, "shuffle"); +    if (auto *I = dyn_cast<Instruction>(V)) { +      GatherSeq.insert(I); +      CSEBlocks.insert(I->getParent()); +    } +  } +  return V; +} + +static void inversePermutation(ArrayRef<unsigned> Indices, +                               SmallVectorImpl<unsigned> &Mask) { +  Mask.clear(); +  const unsigned E = Indices.size(); +  Mask.resize(E); +  for (unsigned I = 0; I < E; ++I) +    Mask[Indices[I]] = I;  }  Value *BoUpSLP::vectorizeTree(TreeEntry *E) {    IRBuilder<>::InsertPointGuard Guard(Builder);    if (E->VectorizedValue) { -    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); +    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");      return E->VectorizedValue;    }    InstructionsState S = getSameOpcode(E->Scalars); -  Instruction *VL0 = cast<Instruction>(E->Scalars[0]); +  Instruction *VL0 = cast<Instruction>(S.OpValue);    Type *ScalarTy = VL0->getType();    if (StoreInst *SI = dyn_cast<StoreInst>(VL0))      ScalarTy = SI->getValueOperand()->getType();    VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); +  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); +    if (E->NeedToGather) { -    setInsertPointAfterBundle(E->Scalars, VL0); +    setInsertPointAfterBundle(E->Scalars, S);      auto *V = Gather(E->Scalars, VecTy); +    if (NeedToShuffleReuses) { +      V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                      E->ReuseShuffleIndices, "shuffle"); +      if (auto *I = dyn_cast<Instruction>(V)) { +        GatherSeq.insert(I); +        CSEBlocks.insert(I->getParent()); +      } +    }      E->VectorizedValue = V;      return V;    } -  unsigned ShuffleOrOp = S.IsAltShuffle ? -           (unsigned) Instruction::ShuffleVector : S.Opcode; +  unsigned ShuffleOrOp = S.isAltShuffle() ? +           (unsigned) Instruction::ShuffleVector : S.getOpcode();    switch (ShuffleOrOp) {      case Instruction::PHI: {        PHINode *PH = dyn_cast<PHINode>(VL0);        Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());        Builder.SetCurrentDebugLocation(PH->getDebugLoc());        PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); -      E->VectorizedValue = NewPhi; +      Value *V = NewPhi; +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      } +      E->VectorizedValue = V;        // PHINodes may have multiple entries from the same block. We want to        // visit every block once. 
-      SmallSet<BasicBlock*, 4> VisitedBBs; +      SmallPtrSet<BasicBlock*, 4> VisitedBBs;        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {          ValueList Operands; @@ -2804,32 +3094,74 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&               "Invalid number of incoming values"); -      return NewPhi; +      return V;      }      case Instruction::ExtractElement: { -      if (canReuseExtract(E->Scalars, VL0)) { +      if (!E->NeedToGather) {          Value *V = VL0->getOperand(0); +        if (!E->ReorderIndices.empty()) { +          OrdersType Mask; +          inversePermutation(E->ReorderIndices, Mask); +          Builder.SetInsertPoint(VL0); +          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, +                                          "reorder_shuffle"); +        } +        if (NeedToShuffleReuses) { +          // TODO: Merge this shuffle with the ReorderShuffleMask. +          if (!E->ReorderIndices.empty()) +            Builder.SetInsertPoint(VL0); +          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                          E->ReuseShuffleIndices, "shuffle"); +        }          E->VectorizedValue = V;          return V;        } -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        auto *V = Gather(E->Scalars, VecTy); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +        if (auto *I = dyn_cast<Instruction>(V)) { +          GatherSeq.insert(I); +          CSEBlocks.insert(I->getParent()); +        } +      }        E->VectorizedValue = V;        return V;      }      case Instruction::ExtractValue: { -      if (canReuseExtract(E->Scalars, VL0)) { +      if (!E->NeedToGather) {          LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));          Builder.SetInsertPoint(LI);          PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());          Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);          LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); -        E->VectorizedValue = V; -        return propagateMetadata(V, E->Scalars); +        Value *NewV = propagateMetadata(V, E->Scalars); +        if (!E->ReorderIndices.empty()) { +          OrdersType Mask; +          inversePermutation(E->ReorderIndices, Mask); +          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, +                                             "reorder_shuffle"); +        } +        if (NeedToShuffleReuses) { +          // TODO: Merge this shuffle with the ReorderShuffleMask. 
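          // Editorial note, not from this patch: folding the two shuffles
          // would compose the masks, e.g. a reorder mask <2,0,1> followed by
          // a reuse mask <0,0,2> equals the single mask <2,2,1>, saving one
          // shufflevector.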
+          NewV = Builder.CreateShuffleVector( +              NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); +        } +        E->VectorizedValue = NewV; +        return NewV;        } -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        auto *V = Gather(E->Scalars, VecTy); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +        if (auto *I = dyn_cast<Instruction>(V)) { +          GatherSeq.insert(I); +          CSEBlocks.insert(I->getParent()); +        } +      }        E->VectorizedValue = V;        return V;      } @@ -2849,15 +3181,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        for (Value *V : E->Scalars)          INVL.push_back(cast<Instruction>(V)->getOperand(0)); -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Value *InVec = vectorizeTree(INVL); -      if (Value *V = alreadyVectorized(E->Scalars, VL0)) -        return V; +      if (E->VectorizedValue) { +        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); +        return E->VectorizedValue; +      }        CastInst *CI = dyn_cast<CastInst>(VL0);        Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V;        ++NumVectorInstructions;        return V; @@ -2870,23 +3208,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          RHSV.push_back(cast<Instruction>(V)->getOperand(1));        } -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Value *L = vectorizeTree(LHSV);        Value *R = vectorizeTree(RHSV); -      if (Value *V = alreadyVectorized(E->Scalars, VL0)) -        return V; +      if (E->VectorizedValue) { +        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); +        return E->VectorizedValue; +      }        CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();        Value *V; -      if (S.Opcode == Instruction::FCmp) +      if (S.getOpcode() == Instruction::FCmp)          V = Builder.CreateFCmp(P0, L, R);        else          V = Builder.CreateICmp(P0, L, R); +      propagateIRFlags(V, E->Scalars, VL0); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions;        return V;      } @@ -2898,16 +3242,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          FalseVec.push_back(cast<Instruction>(V)->getOperand(2));        } -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Value *Cond = vectorizeTree(CondVec);        Value *True = vectorizeTree(TrueVec);        Value *False = vectorizeTree(FalseVec); -      if (Value *V = alreadyVectorized(E->Scalars, VL0)) -        return V; +      if (E->VectorizedValue) { +        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); +        return E->VectorizedValue; +      }        Value *V = Builder.CreateSelect(Cond, True, False); +      if 
(NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V;        ++NumVectorInstructions;        return V; @@ -2932,7 +3282,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {      case Instruction::Xor: {        ValueList LHSVL, RHSVL;        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) -        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, +        reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL,                                         RHSVL);        else          for (Value *V : E->Scalars) { @@ -2941,29 +3291,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {            RHSVL.push_back(I->getOperand(1));          } -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Value *LHS = vectorizeTree(LHSVL);        Value *RHS = vectorizeTree(RHSVL); -      if (Value *V = alreadyVectorized(E->Scalars, VL0)) -        return V; +      if (E->VectorizedValue) { +        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); +        return E->VectorizedValue; +      }        Value *V = Builder.CreateBinOp( -          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); +          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); +      propagateIRFlags(V, E->Scalars, VL0); +      if (auto *I = dyn_cast<Instruction>(V)) +        V = propagateMetadata(I, E->Scalars); + +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions; -      if (Instruction *I = dyn_cast<Instruction>(V)) -        return propagateMetadata(I, E->Scalars); -        return V;      }      case Instruction::Load: {        // Loads are inserted at the head of the tree because we don't want to        // sink them all the way down past store instructions. -      setInsertPointAfterBundle(E->Scalars, VL0); +      bool IsReorder = !E->ReorderIndices.empty(); +      if (IsReorder) { +        S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); +        VL0 = cast<Instruction>(S.OpValue); +      } +      setInsertPointAfterBundle(E->Scalars, S);        LoadInst *LI = cast<LoadInst>(VL0);        Type *ScalarLoadTy = LI->getType(); @@ -2985,9 +3346,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          Alignment = DL->getABITypeAlignment(ScalarLoadTy);        }        LI->setAlignment(Alignment); -      E->VectorizedValue = LI; +      Value *V = propagateMetadata(LI, E->Scalars); +      if (IsReorder) { +        OrdersType Mask; +        inversePermutation(E->ReorderIndices, Mask); +        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), +                                        Mask, "reorder_shuffle"); +      } +      if (NeedToShuffleReuses) { +        // TODO: Merge this shuffle with the ReorderShuffleMask. 
+        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      } +      E->VectorizedValue = V;        ++NumVectorInstructions; -      return propagateMetadata(LI, E->Scalars); +      return V;      }      case Instruction::Store: {        StoreInst *SI = cast<StoreInst>(VL0); @@ -2998,12 +3371,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        for (Value *V : E->Scalars)          ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Value *VecValue = vectorizeTree(ScalarStoreValues);        Value *ScalarPtr = SI->getPointerOperand();        Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); -      StoreInst *S = Builder.CreateStore(VecValue, VecPtr); +      StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);        // The pointer operand uses an in-tree scalar, so add the new BitCast to        // ExternalUses to make sure that an extract will be generated in the @@ -3014,13 +3387,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        if (!Alignment)          Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); -      S->setAlignment(Alignment); -      E->VectorizedValue = S; +      ST->setAlignment(Alignment); +      Value *V = propagateMetadata(ST, E->Scalars); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      } +      E->VectorizedValue = V;        ++NumVectorInstructions; -      return propagateMetadata(S, E->Scalars); +      return V;      }      case Instruction::GetElementPtr: { -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        ValueList Op0VL;        for (Value *V : E->Scalars) @@ -3041,17 +3419,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        Value *V = Builder.CreateGEP(            cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); +      if (Instruction *I = dyn_cast<Instruction>(V)) +        V = propagateMetadata(I, E->Scalars); + +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V;        ++NumVectorInstructions; -      if (Instruction *I = dyn_cast<Instruction>(V)) -        return propagateMetadata(I, E->Scalars); -        return V;      }      case Instruction::Call: {        CallInst *CI = cast<CallInst>(VL0); -      setInsertPointAfterBundle(E->Scalars, VL0); +      setInsertPointAfterBundle(E->Scalars, S);        Function *FI;        Intrinsic::ID IID  = Intrinsic::not_intrinsic;        Value *ScalarArg = nullptr; @@ -3075,7 +3457,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {          }          Value *OpVec = vectorizeTree(OpVL); -        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); +        LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");          OpVecs.push_back(OpVec);        } @@ -3093,58 +3475,87 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {        if (ScalarArg && getTreeEntry(ScalarArg))          ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); +      propagateIRFlags(V, E->Scalars, VL0); +      if (NeedToShuffleReuses) { +        V = 
Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V; -      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);        ++NumVectorInstructions;        return V;      }      case Instruction::ShuffleVector: {        ValueList LHSVL, RHSVL; -      assert(Instruction::isBinaryOp(S.Opcode) && +      assert(S.isAltShuffle() && +             ((Instruction::isBinaryOp(S.getOpcode()) && +               Instruction::isBinaryOp(S.getAltOpcode())) || +              (Instruction::isCast(S.getOpcode()) && +               Instruction::isCast(S.getAltOpcode()))) &&               "Invalid Shuffle Vector Operand"); -      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); -      setInsertPointAfterBundle(E->Scalars, VL0); -      Value *LHS = vectorizeTree(LHSVL); -      Value *RHS = vectorizeTree(RHSVL); - -      if (Value *V = alreadyVectorized(E->Scalars, VL0)) -        return V; +      Value *LHS, *RHS; +      if (Instruction::isBinaryOp(S.getOpcode())) { +        reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); +        setInsertPointAfterBundle(E->Scalars, S); +        LHS = vectorizeTree(LHSVL); +        RHS = vectorizeTree(RHSVL); +      } else { +        ValueList INVL; +        for (Value *V : E->Scalars) +          INVL.push_back(cast<Instruction>(V)->getOperand(0)); +        setInsertPointAfterBundle(E->Scalars, S); +        LHS = vectorizeTree(INVL); +      } -      // Create a vector of LHS op1 RHS -      Value *V0 = Builder.CreateBinOp( -          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS); +      if (E->VectorizedValue) { +        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); +        return E->VectorizedValue; +      } -      unsigned AltOpcode = getAltOpcode(S.Opcode); -      // Create a vector of LHS op2 RHS -      Value *V1 = Builder.CreateBinOp( -          static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS); +      Value *V0, *V1; +      if (Instruction::isBinaryOp(S.getOpcode())) { +        V0 = Builder.CreateBinOp( +          static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); +        V1 = Builder.CreateBinOp( +          static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); +      } else { +        V0 = Builder.CreateCast( +            static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); +        V1 = Builder.CreateCast( +            static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); +      }        // Create shuffle to take alternate operations from the vector. -      // Also, gather up odd and even scalar ops to propagate IR flags to +      // Also, gather up main and alt scalar ops to propagate IR flags to        // each vector operation. 
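      // Worked example (not from this patch): for E->Scalars =
      // {a+b, c-d, x+y, z-w} with main opcode add and alternate opcode sub
      // (e = 4), the loop below produces Mask = <0, 5, 2, 7>: main-opcode
      // lanes select element i of V0, alternate-opcode lanes select element
      // e + i of V1.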
-      ValueList OddScalars, EvenScalars; +      ValueList OpScalars, AltScalars;        unsigned e = E->Scalars.size();        SmallVector<Constant *, 8> Mask(e);        for (unsigned i = 0; i < e; ++i) { -        if (isOdd(i)) { +        auto *OpInst = cast<Instruction>(E->Scalars[i]); +        assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); +        if (OpInst->getOpcode() == S.getAltOpcode()) {            Mask[i] = Builder.getInt32(e + i); -          OddScalars.push_back(E->Scalars[i]); +          AltScalars.push_back(E->Scalars[i]);          } else {            Mask[i] = Builder.getInt32(i); -          EvenScalars.push_back(E->Scalars[i]); +          OpScalars.push_back(E->Scalars[i]);          }        }        Value *ShuffleMask = ConstantVector::get(Mask); -      propagateIRFlags(V0, EvenScalars); -      propagateIRFlags(V1, OddScalars); +      propagateIRFlags(V0, OpScalars); +      propagateIRFlags(V1, AltScalars);        Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); +      if (Instruction *I = dyn_cast<Instruction>(V)) +        V = propagateMetadata(I, E->Scalars); +      if (NeedToShuffleReuses) { +        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), +                                        E->ReuseShuffleIndices, "shuffle"); +      }        E->VectorizedValue = V;        ++NumVectorInstructions; -      if (Instruction *I = dyn_cast<Instruction>(V)) -        return propagateMetadata(I, E->Scalars);        return V;      } @@ -3183,7 +3594,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {      VectorizableTree[0].VectorizedValue = Trunc;    } -  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); +  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() +                    << " values .\n");    // If necessary, sign-extend or zero-extend ScalarRoot to the larger type    // specified by ScalarType. @@ -3260,7 +3672,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {          Ex = extend(ScalarRoot, Ex, Scalar->getType());          CSEBlocks.insert(cast<Instruction>(User)->getParent());          User->replaceUsesOfWith(Scalar, Ex); -     } +      }      } else {        Builder.SetInsertPoint(&F->getEntryBlock().front());        Value *Ex = Builder.CreateExtractElement(Vec, Lane); @@ -3269,7 +3681,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {        User->replaceUsesOfWith(Scalar, Ex);      } -    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); +    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");    }    // For each vectorized value: @@ -3290,7 +3702,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {        if (!Ty->isVoidTy()) {  #ifndef NDEBUG          for (User *U : Scalar->users()) { -          DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); +          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");            // It is legal to replace users in the ignorelist by undef.            
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && @@ -3300,7 +3712,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {          Value *Undef = UndefValue::get(Ty);          Scalar->replaceAllUsesWith(Undef);        } -      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); +      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");        eraseInstruction(cast<Instruction>(Scalar));      }    } @@ -3310,18 +3722,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {    return VectorizableTree[0].VectorizedValue;  } -void BoUpSLP::optimizeGatherSequence(Function &F) { -  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() -        << " gather sequences instructions.\n"); +void BoUpSLP::optimizeGatherSequence() { +  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() +                    << " gather sequences instructions.\n");    // LICM InsertElementInst sequences. -  for (Instruction *it : GatherSeq) { -    InsertElementInst *Insert = dyn_cast<InsertElementInst>(it); - -    if (!Insert) +  for (Instruction *I : GatherSeq) { +    if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))        continue;      // Check if this block is inside a loop. -    Loop *L = LI->getLoopFor(Insert->getParent()); +    Loop *L = LI->getLoopFor(I->getParent());      if (!L)        continue; @@ -3333,27 +3743,41 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {      // If the vector or the element that we insert into it are      // instructions that are defined in this basic block then we can't      // hoist this instruction. -    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); -    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); -    if (CurrVec && L->contains(CurrVec)) +    auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); +    auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); +    if (Op0 && L->contains(Op0))        continue; -    if (NewElem && L->contains(NewElem)) +    if (Op1 && L->contains(Op1))        continue;      // We can hoist this instruction. Move it to the pre-header. -    Insert->moveBefore(PreHeader->getTerminator()); +    I->moveBefore(PreHeader->getTerminator());    } +  // Make a list of all reachable blocks in our CSE queue. +  SmallVector<const DomTreeNode *, 8> CSEWorkList; +  CSEWorkList.reserve(CSEBlocks.size()); +  for (BasicBlock *BB : CSEBlocks) +    if (DomTreeNode *N = DT->getNode(BB)) { +      assert(DT->isReachableFromEntry(N)); +      CSEWorkList.push_back(N); +    } + +  // Sort blocks by domination. This ensures we visit a block after all blocks +  // dominating it are visited. +  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), +                   [this](const DomTreeNode *A, const DomTreeNode *B) { +    return DT->properlyDominates(A, B); +  }); +    // Perform O(N^2) search over the gather sequences and merge identical    // instructions. TODO: We can further optimize this scan if we split the    // instructions into different buckets based on the insert lane.    SmallVector<Instruction *, 16> Visited; -  ReversePostOrderTraversal<Function *> RPOT(&F); -  for (auto BB : RPOT) { -    // Traverse CSEBlocks by RPOT order. 
-    if (!CSEBlocks.count(BB)) -      continue; - +  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { +    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && +           "Worklist not sorted properly!"); +    BasicBlock *BB = (*I)->getBlock();      // For all instructions in blocks containing gather sequences:      for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {        Instruction *In = &*it++; @@ -3384,8 +3808,9 @@ void BoUpSLP::optimizeGatherSequence(Function &F) {  // Groups the instructions to a bundle (which is then a single scheduling entity)  // and schedules instructions until the bundle gets ready.  bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, -                                                 BoUpSLP *SLP, Value *OpValue) { -  if (isa<PHINode>(OpValue)) +                                                 BoUpSLP *SLP, +                                                 const InstructionsState &S) { +  if (isa<PHINode>(S.OpValue))      return true;    // Initialize the instruction bundle. @@ -3393,12 +3818,12 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,    ScheduleData *PrevInBundle = nullptr;    ScheduleData *Bundle = nullptr;    bool ReSchedule = false; -  DEBUG(dbgs() << "SLP:  bundle: " << *OpValue << "\n"); +  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");    // Make sure that the scheduling region contains all    // instructions of the bundle.    for (Value *V : VL) { -    if (!extendSchedulingRegion(V, OpValue)) +    if (!extendSchedulingRegion(V, S))        return false;    } @@ -3410,8 +3835,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,        // A bundle member was scheduled as single instruction before and now        // needs to be scheduled as part of the bundle. We just get rid of the        // existing schedule. 
-      DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember -                   << " was already scheduled\n"); +      LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember +                        << " was already scheduled\n");        ReSchedule = true;      }      assert(BundleMember->isSchedulingEntity() && @@ -3446,8 +3871,8 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,      initialFillReadyList(ReadyInsts);    } -  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " -               << BB->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " +                    << BB->getName() << "\n");    calculateDependencies(Bundle, true, SLP); @@ -3465,7 +3890,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,      }    }    if (!Bundle->isReady()) { -    cancelScheduling(VL, OpValue); +    cancelScheduling(VL, S.OpValue);      return false;    }    return true; @@ -3477,7 +3902,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,      return;    ScheduleData *Bundle = getScheduleData(OpValue); -  DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n"); +  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");    assert(!Bundle->IsScheduled &&           "Can't cancel bundle which is already scheduled");    assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && @@ -3508,13 +3933,13 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {  }  bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, -                                                      Value *OpValue) { -  if (getScheduleData(V, isOneOf(OpValue, V))) +                                                      const InstructionsState &S) { +  if (getScheduleData(V, isOneOf(S, V)))      return true;    Instruction *I = dyn_cast<Instruction>(V);    assert(I && "bundle member must be an instruction");    assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); -  auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool { +  auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {      ScheduleData *ISD = getScheduleData(I);      if (!ISD)        return false; @@ -3522,8 +3947,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,             "ScheduleData not in scheduling region");      ScheduleData *SD = allocateScheduleDataChunks();      SD->Inst = I; -    SD->init(SchedulingRegionID, OpValue); -    ExtraScheduleDataMap[I][OpValue] = SD; +    SD->init(SchedulingRegionID, S.OpValue); +    ExtraScheduleDataMap[I][S.OpValue] = SD;      return true;    };    if (CheckSheduleForI(I)) @@ -3533,10 +3958,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,      initScheduleData(I, I->getNextNode(), nullptr, nullptr);      ScheduleStart = I;      ScheduleEnd = I->getNextNode(); -    if (isOneOf(OpValue, I) != I) +    if (isOneOf(S, I) != I)        CheckSheduleForI(I);      assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); -    DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n"); +    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");      return true;    }    // Search up and down at the same time, because we don't know if the new @@ -3548,7 +3973,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,    BasicBlock::iterator LowerEnd = BB->end();    while (true) {      if 
(++ScheduleRegionSize > ScheduleRegionSizeLimit) { -      DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n"); +      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");        return false;      } @@ -3556,9 +3981,10 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,        if (&*UpIter == I) {          initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);          ScheduleStart = I; -        if (isOneOf(OpValue, I) != I) +        if (isOneOf(S, I) != I)            CheckSheduleForI(I); -        DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n"); +        LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I +                          << "\n");          return true;        }        UpIter++; @@ -3568,10 +3994,11 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,          initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,                           nullptr);          ScheduleEnd = I->getNextNode(); -        if (isOneOf(OpValue, I) != I) +        if (isOneOf(S, I) != I)            CheckSheduleForI(I);          assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); -        DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n"); +        LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I +                          << "\n");          return true;        }        DownIter++; @@ -3635,7 +4062,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,        assert(isInSchedulingRegion(BundleMember));        if (!BundleMember->hasValidDependencies()) { -        DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n"); +        LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember +                          << "\n");          BundleMember->Dependencies = 0;          BundleMember->resetUnscheduledDeps(); @@ -3727,7 +4155,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,              // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8              // and we can abort this loop at i6.              if (DistToSrc >= 2 * MaxMemDepDistance) -                break; +              break;              DistToSrc++;            }          } @@ -3736,7 +4164,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,      }      if (InsertInReadyList && SD->isReady()) {        ReadyInsts.push_back(SD); -      DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n"); +      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst +                        << "\n");      }    }  } @@ -3759,7 +4188,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {    if (!BS->ScheduleStart)      return; -  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");    BS->resetSchedule(); @@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {    // We start by looking at each entry that can be demoted. We compute the    // maximum bit width required to store the scalar by using ValueTracking to    // compute the number of high-order bits we can truncate. 
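  // Illustration (not part of this patch): the tightened guard below only
  // seeds MaxBitWidth with 8 when every root's single user is a GEP, e.g.
  //   %idx = add i64 %a, %b                                  ; tree root
  //   %p1  = getelementptr inbounds i8, i8* %base, i64 %idx  ; sole user
  // (hypothetical names); a root with an arithmetic user no longer takes
  // this path.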
@@ -4025,7 +4454,11 @@ void BoUpSLP::computeMinimumValueSizes() {
   // We start by looking at each entry that can be demoted. We compute the
   // maximum bit width required to store the scalar by using ValueTracking to
   // compute the number of high-order bits we can truncate.
-  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
+  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+      llvm::all_of(TreeRoot, [](Value *R) {
+        assert(R->hasOneUse() && "Root should have only one use!");
+        return isa<GetElementPtrInst>(R->user_back());
+      })) {
     MaxBitWidth = 8u;

     // Determine if the sign bit of all the roots is known to be zero. If not,
@@ -4188,7 +4621,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
     return false;

-  DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

   // Use the bottom up slp vectorizer to construct chains that start with
   // store instructions.
@@ -4203,8 +4636,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,

     // Vectorize trees that end at stores.
     if (!Stores.empty()) {
-      DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
-                   << " underlying objects.\n");
+      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+                        << " underlying objects.\n");
       Changed |= vectorizeStoreChains(R);
     }

@@ -4215,21 +4648,21 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
     // is primarily intended to catch gather-like idioms ending at
     // non-consecutive loads.
     if (!GEPs.empty()) {
-      DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
-                   << " underlying objects.\n");
+      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+                        << " underlying objects.\n");
       Changed |= vectorizeGEPIndices(BB, R);
     }
   }

   if (Changed) {
-    R.optimizeGatherSequence(F);
-    DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
-    DEBUG(verifyFunction(F));
+    R.optimizeGatherSequence();
+    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+    LLVM_DEBUG(verifyFunction(F));
   }
   return Changed;
 }

-/// \brief Check that the Values in the slice in VL array are still existent in
+/// Check that the Values in the slice in VL array are still existent in
 /// the WeakTrackingVH array.
 /// Vectorization of part of the VL array may cause later values in the VL array
 /// to become invalid. We track when this has happened in the WeakTrackingVH
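The tightened guard above only demotes the roots to 8 bits when every root has a single user and that user is a getelementptr. The shape of the check, reduced to a standalone predicate; UserKind is a hypothetical stand-in for inspecting R->user_back():

    #include <algorithm>
    #include <vector>

    enum class UserKind { GetElementPtr, Other };

    // True only if every tree root's sole user is a GEP, mirroring the
    // llvm::all_of guard added in the hunk above.
    bool allRootsFeedGEPs(const std::vector<UserKind> &RootUsers) {
      return std::all_of(RootUsers.begin(), RootUsers.end(),
                         [](UserKind K) { return K == UserKind::GetElementPtr; });
    }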
@@ -4244,30 +4677,28 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,

 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                             unsigned VecRegSize) {
-  unsigned ChainLen = Chain.size();
-  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
-        << "\n");
-  unsigned Sz = R.getVectorElementSize(Chain[0]);
-  unsigned VF = VecRegSize / Sz;
+  const unsigned ChainLen = Chain.size();
+  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+                    << "\n");
+  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  const unsigned VF = VecRegSize / Sz;

   if (!isPowerOf2_32(Sz) || VF < 2)
     return false;

   // Keep track of values that were deleted by vectorizing in the loop below.
-  SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
+  const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());

   bool Changed = false;
   // Look for profitable vectorizable trees at all offsets, starting at zero.
-  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
-    if (i + VF > e)
-      break;
+  for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) {

     // Check that a previous iteration of this loop did not delete the Value.
     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
       continue;

-    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
-          << "\n");
+    LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+                      << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);

     R.buildTree(Operands);
@@ -4278,9 +4709,10 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,

     int Cost = R.getTreeCost();

-    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF
+                      << "\n");
     if (Cost < -SLPCostThreshold) {
-      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");

       using namespace ore;
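Folding the early break into the loop condition above is behavior-preserving: "i < e" plus "break if i + VF > e" and "i + VF <= e" visit exactly the same offsets. A tiny self-contained demonstration with made-up numbers:

    #include <cstdio>

    int main() {
      const unsigned ChainLen = 10, VF = 4;
      // Visits offsets 0..6, the same set the old
      // "i < e; if (i + VF > e) break;" form produced.
      for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i)
        std::printf("window [%u, %u)\n", i, i + VF);
      return 0;
    }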
@@ -4417,66 +4849,48 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
 }

 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           ArrayRef<Value *> BuildVector,
-                                           bool AllowReorder,
-                                           bool NeedExtraction) {
+                                           int UserCost, bool AllowReorder) {
   if (VL.size() < 2)
     return false;

-  DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
-               << ".\n");
+  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+                    << VL.size() << ".\n");

-  // Check that all of the parts are scalar instructions of the same type.
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
-  if (!I0)
+  // Check that all of the parts are scalar instructions of the same type,
+  // we permit an alternate opcode via InstructionsState.
+  InstructionsState S = getSameOpcode(VL);
+  if (!S.getOpcode())
     return false;

-  unsigned Opcode0 = I0->getOpcode();
-
+  Instruction *I0 = cast<Instruction>(S.OpValue);
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
   if (MaxVF < 2) {
-     R.getORE()->emit([&]() {
-         return OptimizationRemarkMissed(
-                    SV_NAME, "SmallVF", I0)
-                << "Cannot SLP vectorize list: vectorization factor "
-                << "less than 2 is not supported";
-     });
-     return false;
+    R.getORE()->emit([&]() {
+      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+             << "Cannot SLP vectorize list: vectorization factor "
+             << "less than 2 is not supported";
+    });
+    return false;
   }

   for (Value *V : VL) {
     Type *Ty = V->getType();
     if (!isValidElementType(Ty)) {
-      // NOTE: the following will give user internal llvm type name, which may not be useful
+      // NOTE: the following will give user internal llvm type name, which may
+      // not be useful.
       R.getORE()->emit([&]() {
-          std::string type_str;
-          llvm::raw_string_ostream rso(type_str);
-          Ty->print(rso);
-          return OptimizationRemarkMissed(
-                     SV_NAME, "UnsupportedType", I0)
-                 << "Cannot SLP vectorize list: type "
-                 << rso.str() + " is unsupported by vectorizer";
-      });
-      return false;
-    }
-    Instruction *Inst = dyn_cast<Instruction>(V);
-
-    if (!Inst)
-        return false;
-    if (Inst->getOpcode() != Opcode0) {
-      R.getORE()->emit([&]() {
-          return OptimizationRemarkMissed(
-                     SV_NAME, "InequableTypes", I0)
-                 << "Cannot SLP vectorize list: not all of the "
-                 << "parts of scalar instructions are of the same type: "
-                 << ore::NV("Instruction1Opcode", I0) << " and "
-                 << ore::NV("Instruction2Opcode", Inst);
+        std::string type_str;
+        llvm::raw_string_ostream rso(type_str);
+        Ty->print(rso);
+        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+               << "Cannot SLP vectorize list: type "
+               << rso.str() + " is unsupported by vectorizer";
       });
       return false;
     }
@@ -4513,24 +4927,20 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
         continue;

-      DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
-                   << "\n");
+      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+                        << "\n");
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

-      ArrayRef<Value *> EmptyArray;
-      ArrayRef<Value *> BuildVectorSlice;
-      if (!BuildVector.empty())
-        BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
-      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+      R.buildTree(Ops);
+      Optional<ArrayRef<unsigned>> Order = R.bestOrder();
       // TODO: check if we can allow reordering for more cases.
-      if (AllowReorder && R.shouldReorder()) {
+      if (AllowReorder && Order) {
+        // TODO: reorder tree nodes without tree rebuilding.
         // Conceptually, there is nothing actually preventing us from trying to
         // reorder a larger list. In fact, we do exactly this when vectorizing
         // reductions. However, at this point, we only expect to get here when
         // there are exactly two operations.
         assert(Ops.size() == 2);
-        assert(BuildVectorSlice.empty());
         Value *ReorderedOps[] = {Ops[1], Ops[0]};
         R.buildTree(ReorderedOps, None);
       }
@@ -4538,43 +4948,19 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
         continue;

       R.computeMinimumValueSizes();
-      int Cost = R.getTreeCost();
+      int Cost = R.getTreeCost() - UserCost;
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);

       if (Cost < -SLPCostThreshold) {
-        DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                             cast<Instruction>(Ops[0]))
                          << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                          << " and with tree size "
                          << ore::NV("TreeSize", R.getTreeSize()));

-        Value *VectorizedRoot = R.vectorizeTree();
-
-        // Reconstruct the build vector by extracting the vectorized root. This
-        // way we handle the case where some elements of the vector are
-        // undefined.
-        //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
-        if (!BuildVectorSlice.empty()) {
-          // The insert point is the last build vector instruction. The
-          // vectorized root will precede it. This guarantees that we get an
-          // instruction. The vectorized tree could have been constant folded.
-          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
-          unsigned VecIdx = 0;
-          for (auto &V : BuildVectorSlice) {
-            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                        ++BasicBlock::iterator(InsertAfter));
-            Instruction *I = cast<Instruction>(V);
-            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
-            Instruction *Extract =
-                cast<Instruction>(Builder.CreateExtractElement(
-                    VectorizedRoot, Builder.getInt32(VecIdx++)));
-            I->setOperand(1, Extract);
-            I->moveAfter(Extract);
-            InsertAfter = I;
-          }
-        }
+        R.vectorizeTree();

         // Move to the next bundle.
         I += VF - 1;
         NextInst = I + 1;
@@ -4585,18 +4971,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,

   if (!Changed && CandidateFound) {
     R.getORE()->emit([&]() {
-        return OptimizationRemarkMissed(
-                   SV_NAME, "NotBeneficial",  I0)
-               << "List vectorization was possible but not beneficial with cost "
-               << ore::NV("Cost", MinCost) << " >= "
-               << ore::NV("Treshold", -SLPCostThreshold);
+      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+             << "List vectorization was possible but not beneficial with cost "
+             << ore::NV("Cost", MinCost) << " >= "
+             << ore::NV("Treshold", -SLPCostThreshold);
     });
   } else if (!Changed) {
     R.getORE()->emit([&]() {
-        return OptimizationRemarkMissed(
-                   SV_NAME, "NotPossible", I0)
-               << "Cannot SLP vectorize list: vectorization was impossible"
-               << " with available vectorization factors";
+      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+             << "Cannot SLP vectorize list: vectorization was impossible"
+             << " with available vectorization factors";
     });
   }
   return Changed;
@@ -4645,7 +5029,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   return false;
 }

-/// \brief Generate a shuffle mask to be used in a reduction tree.
+/// Generate a shuffle mask to be used in a reduction tree.
 ///
 /// \param VecLen The length of the vector to be reduced.
 /// \param NumEltsToRdx The number of elements that should be reduced in the
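With the new UserCost parameter, the cost of the insertelement chain that vectorization will make dead is credited against the tree cost before the threshold test. The decision, reduced to a standalone predicate with hypothetical values:

    // Negative net cost means an expected improvement; UserCost is the credit
    // for build-vector instructions that become dead after vectorization.
    bool profitable(int TreeCost, int UserCost, int SLPCostThreshold) {
      int Cost = TreeCost - UserCost;
      return Cost < -SLPCostThreshold;
    }
    // e.g. profitable(/*TreeCost=*/2, /*UserCost=*/5, /*SLPCostThreshold=*/0)
    // is true: the dead inserts outweigh the cost of the vectorized tree.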
@@ -5128,6 +5512,77 @@ class HorizontalReduction {
         return OperationData(
             Instruction::FCmp, LHS, RHS, RK_Max,
             cast<Instruction>(Select->getCondition())->hasNoNaNs());
+      } else {
+        // Try harder: look for min/max pattern based on instructions producing
+        // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+        // During the intermediate stages of SLP, it's very common to have
+        // pattern like this (since optimizeGatherSequence is run only once
+        // at the end):
+        // %1 = extractelement <2 x i32> %a, i32 0
+        // %2 = extractelement <2 x i32> %a, i32 1
+        // %cond = icmp sgt i32 %1, %2
+        // %3 = extractelement <2 x i32> %a, i32 0
+        // %4 = extractelement <2 x i32> %a, i32 1
+        // %select = select i1 %cond, i32 %3, i32 %4
+        CmpInst::Predicate Pred;
+        Instruction *L1;
+        Instruction *L2;
+
+        LHS = Select->getTrueValue();
+        RHS = Select->getFalseValue();
+        Value *Cond = Select->getCondition();
+
+        // TODO: Support inverse predicates.
+        if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+          if (!isa<ExtractElementInst>(RHS) ||
+              !L2->isIdenticalTo(cast<Instruction>(RHS)))
+            return OperationData(V);
+        } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+          if (!isa<ExtractElementInst>(LHS) ||
+              !L1->isIdenticalTo(cast<Instruction>(LHS)))
+            return OperationData(V);
+        } else {
+          if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+            return OperationData(V);
+          if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+              !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+              !L2->isIdenticalTo(cast<Instruction>(RHS)))
+            return OperationData(V);
+        }
+        switch (Pred) {
+        default:
+          return OperationData(V);
+
+        case CmpInst::ICMP_ULT:
+        case CmpInst::ICMP_ULE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+        case CmpInst::ICMP_SLT:
+        case CmpInst::ICMP_SLE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+        case CmpInst::FCMP_OLT:
+        case CmpInst::FCMP_OLE:
+        case CmpInst::FCMP_ULT:
+        case CmpInst::FCMP_ULE:
+          return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+                               cast<Instruction>(Cond)->hasNoNaNs());
+
+        case CmpInst::ICMP_UGT:
+        case CmpInst::ICMP_UGE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+        case CmpInst::ICMP_SGT:
+        case CmpInst::ICMP_SGE:
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+        case CmpInst::FCMP_OGT:
+        case CmpInst::FCMP_OGE:
+        case CmpInst::FCMP_UGT:
+        case CmpInst::FCMP_UGE:
+          return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+                               cast<Instruction>(Cond)->hasNoNaNs());
+        }
       }
     }
     return OperationData(V);
@@ -5136,7 +5591,7 @@ class HorizontalReduction {
 public:
   HorizontalReduction() = default;

-  /// \brief Try to find a reduction tree.
+  /// Try to find a reduction tree.
   bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
     assert((!Phi || is_contained(Phi->operands(), B)) &&
            "Thi phi needs to use the binary operator");
@@ -5164,6 +5619,8 @@ public:
     Type *Ty = B->getType();
     if (!isValidElementType(Ty))
       return false;
+    if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+      return false;

     ReducedValueData.clear();
     ReductionRoot = B;
@@ -5262,7 +5719,7 @@ public:
     return true;
   }

-  /// \brief Attempt to vectorize the tree found by
+  /// Attempt to vectorize the tree found by
   /// matchAssociativeReduction.
   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
     if (ReducedVals.empty())
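The switch above maps comparison predicates to reduction kinds. Its integer half, transcribed into a small self-contained C++ mirror for readability; the enums are stand-ins for the LLVM types, and the FP cases follow the same pattern with an extra no-NaNs flag:

    #include <optional>

    enum class ReductionKind { UMin, Min, UMax, Max };
    enum class Pred { ULT, ULE, SLT, SLE, UGT, UGE, SGT, SGE };

    std::optional<ReductionKind> kindForPredicate(Pred P) {
      switch (P) {
      case Pred::ULT: case Pred::ULE: return ReductionKind::UMin;
      case Pred::SLT: case Pred::SLE: return ReductionKind::Min;
      case Pred::UGT: case Pred::UGE: return ReductionKind::UMax;
      case Pred::SGT: case Pred::SGE: return ReductionKind::Max;
      }
      return std::nullopt; // mirrors the switch's bail-out default
    }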
@@ -5295,9 +5752,14 @@ public:
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
-      if (V.shouldReorder()) {
-        SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
-        V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+      Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+      // TODO: Handle orders of size less than number of elements in the vector.
+      if (Order && Order->size() == VL.size()) {
+        // TODO: reorder tree nodes without tree rebuilding.
+        SmallVector<Value *, 4> ReorderedOps(VL.size());
+        llvm::transform(*Order, ReorderedOps.begin(),
+                        [VL](const unsigned Idx) { return VL[Idx]; });
+        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
@@ -5305,8 +5767,9 @@ public:
       V.computeMinimumValueSizes();

       // Estimate cost.
-      int Cost =
-          V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+      int TreeCost = V.getTreeCost();
+      int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+      int Cost = TreeCost + ReductionCost;
       if (Cost >= -SLPCostThreshold) {
           V.getORE()->emit([&]() {
               return OptimizationRemarkMissed(
@@ -5319,8 +5782,8 @@ public:
           break;
       }

-      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
-                   << ". (HorRdx)\n");
+      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+                        << Cost << ". (HorRdx)\n");
       V.getORE()->emit([&]() {
           return OptimizationRemark(
                      SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
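bestOrder() returns a permutation of the lanes, and the patch materializes it with llvm::transform so that ReorderedOps[i] == VL[Order[i]] before rebuilding the tree. The same step in standalone form; the element type is a placeholder:

    #include <algorithm>
    #include <vector>

    std::vector<int> applyOrder(const std::vector<int> &VL,
                                const std::vector<unsigned> &Order) {
      std::vector<int> Reordered(VL.size());
      // Reordered[i] = VL[Order[i]], exactly what the llvm::transform above does.
      std::transform(Order.begin(), Order.end(), Reordered.begin(),
                     [&VL](unsigned Idx) { return VL[Idx]; });
      return Reordered;
    }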
@@ -5382,7 +5845,7 @@ public:
   }

 private:
-  /// \brief Calculate the cost of a reduction.
+  /// Calculate the cost of a reduction.
   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
                        unsigned ReduxWidth) {
     Type *ScalarTy = FirstReducedVal->getType();
@@ -5441,16 +5904,16 @@ private:
     }
     ScalarReduxCost *= (ReduxWidth - 1);

-    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
-                 << " for reduction that starts with " << *FirstReducedVal
-                 << " (It is a "
-                 << (IsPairwiseReduction ? "pairwise" : "splitting")
-                 << " reduction)\n");
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                      << " for reduction that starts with " << *FirstReducedVal
+                      << " (It is a "
+                      << (IsPairwiseReduction ? "pairwise" : "splitting")
+                      << " reduction)\n");

     return VecReduxCost - ScalarReduxCost;
   }

-  /// \brief Emit a horizontal reduction of the vectorized value.
+  /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
                        unsigned ReduxWidth, const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
@@ -5486,7 +5949,7 @@ private:

 } // end anonymous namespace

-/// \brief Recognize construction of vectors like
+/// Recognize construction of vectors like
 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
@@ -5495,11 +5958,17 @@ private:
 ///
 /// Returns true if it matches
 static bool findBuildVector(InsertElementInst *LastInsertElem,
-                            SmallVectorImpl<Value *> &BuildVector,
-                            SmallVectorImpl<Value *> &BuildVectorOpds) {
+                            TargetTransformInfo *TTI,
+                            SmallVectorImpl<Value *> &BuildVectorOpds,
+                            int &UserCost) {
+  UserCost = 0;
   Value *V = nullptr;
   do {
-    BuildVector.push_back(LastInsertElem);
+    if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) {
+      UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+                                          LastInsertElem->getType(),
+                                          CI->getZExtValue());
+    }
     BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
     V = LastInsertElem->getOperand(0);
     if (isa<UndefValue>(V))
@@ -5508,20 +5977,17 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
     if (!LastInsertElem || !LastInsertElem->hasOneUse())
       return false;
   } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
   return true;
 }

-/// \brief Like findBuildVector, but looks for construction of aggregate.
+/// Like findBuildVector, but looks for construction of aggregate.
 ///
 /// \return true if it matches.
 static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVector,
                                SmallVectorImpl<Value *> &BuildVectorOpds) {
   Value *V;
   do {
-    BuildVector.push_back(IV);
     BuildVectorOpds.push_back(IV->getInsertedValueOperand());
     V = IV->getAggregateOperand();
     if (isa<UndefValue>(V))
@@ -5530,7 +5996,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
     if (!IV || !IV->hasOneUse())
      return false;
   } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
   return true;
 }
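findBuildVector now also sums a per-lane insert cost while it walks operand 0 of the insertelement chain toward the undef base. A standalone sketch of that walk; the InsertElem struct and its LaneCost field are hypothetical stand-ins for the IR nodes and the TTI->getVectorInstrCost query:

    #include <algorithm>
    #include <vector>

    struct InsertElem {
      InsertElem *Vec = nullptr; // operand 0: the partially built vector
      int Scalar = 0;            // operand 1: the inserted scalar
      int LaneCost = 0;          // what the TTI query would return for this lane
      bool OneUse = true;
    };

    bool findBuildVectorSketch(InsertElem *Last, std::vector<int> &Opds,
                               int &UserCost) {
      UserCost = 0;
      while (true) {
        UserCost += Last->LaneCost;  // accumulate the dead-insert credit
        Opds.push_back(Last->Scalar);
        InsertElem *V = Last->Vec;
        if (!V)                      // reached the undef base: complete chain
          break;
        if (!V->OneUse)              // intermediate vector reused elsewhere
          return false;
        Last = V;
      }
      std::reverse(Opds.begin(), Opds.end()); // emit operands in lane order
      return true;
    }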
@@ -5539,7 +6004,7 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   return V->getType() < V2->getType();
 }

-/// \brief Try and get a reduction value from a phi node.
+/// Try and get a reduction value from a phi node.
 ///
 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
 /// if they come from either \p ParentBB or a containing loop latch.
@@ -5552,9 +6017,8 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
   // reduction phi. Vectorizing such cases has been reported to cause
   // miscompiles. See PR25787.
   auto DominatedReduxValue = [&](Value *R) {
-    return (
-        dyn_cast<Instruction>(R) &&
-        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
+    return isa<Instruction>(R) &&
+           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
   };

   Value *Rdx = nullptr;
@@ -5624,7 +6088,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
   SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
-  SmallSet<Value *, 8> VisitedInstrs;
+  SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
   while (!Stack.empty()) {
     Value *V;
@@ -5706,27 +6170,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;

-  SmallVector<Value *, 16> BuildVector;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+  if (!findBuildAggregate(IVI, BuildVectorOpds))
     return false;

-  DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+  return tryToVectorizeList(BuildVectorOpds, R);
 }

 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
-  SmallVector<Value *, 16> BuildVector;
+  int UserCost;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+  if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) ||
+      (llvm::all_of(BuildVectorOpds,
+                    [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+       isShuffle(BuildVectorOpds)))
     return false;

   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
 }

 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5763,7 +6229,7 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
-  SmallSet<Value *, 16> VisitedInstrs;
+  SmallPtrSet<Value *, 16> VisitedInstrs;

   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
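The added bail-out in vectorizeInsertElementInst skips build vectors whose operands are all extractelements forming a recognizable shuffle; such a sequence is presumably better served by the existing shuffle folding than by revectorizing it here. The shape of that check as a standalone predicate; OpKind is a stand-in for the isa<ExtractElementInst> test:

    #include <algorithm>
    #include <vector>

    enum class OpKind { ExtractElement, Other };

    bool shouldSkipAsShuffle(const std::vector<OpKind> &Opds,
                             bool IsShuffle /* result of isShuffle(...) */) {
      return std::all_of(Opds.begin(), Opds.end(),
                         [](OpKind K) { return K == OpKind::ExtractElement; }) &&
             IsShuffle;
    }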
@@ -5798,14 +6264,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {

       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
-      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+                        << NumElts << ")\n");
       // The order in which the phi nodes appear in the program does not matter.
       // So allow tryToVectorizeList to reorder them if it is beneficial. This
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            None, AllowReorder)) {
+                                            /*UserCost=*/0, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
@@ -5885,7 +6352,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
         isa<InsertValueInst>(it))
       PostProcessInstructions.push_back(&*it);
-    }

   return Changed;
@@ -5899,8 +6365,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
     if (Entry.second.size() < 2)
       continue;

-    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
-                 << Entry.second.size() << ".\n");
+    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+                      << Entry.second.size() << ".\n");

     // We process the getelementptr list in chunks of 16 (like we do for
     // stores) to minimize compile-time.
@@ -5982,14 +6448,14 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
     if (it->second.size() < 2)
      continue;

-    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
-          << it->second.size() << ".\n");
+    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+                      << it->second.size() << ".\n");

     // Process the stores in chunks of 16.
     // TODO: The limit of 16 inhibits greater vectorization factors.
     //       For example, AVX2 supports v32i8. Increasing this limit, however,
     //       may cause a significant compile-time increase.
-    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
       unsigned Len = std::min<unsigned>(CE - CI, 16);
       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
     }
