Diffstat (limited to 'lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
 -rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2497
 1 file changed, 1608 insertions, 889 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 432c86dd6f1e..f97732c1c49d 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1,4 +1,4 @@
-//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,32 +16,64 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "dagcombine"
@@ -53,43 +85,41 @@ STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
 STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
 STATISTIC(SlicedLoads, "Number of load sliced");
 
-namespace {
-  static cl::opt<bool>
-    CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
-               cl::desc("Enable DAG combiner's use of IR alias analysis"));
+static cl::opt<bool>
+CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+                 cl::desc("Enable DAG combiner's use of IR alias analysis"));
 
-  static cl::opt<bool>
-    UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
-               cl::desc("Enable DAG combiner's use of TBAA"));
+static cl::opt<bool>
+UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
+        cl::desc("Enable DAG combiner's use of TBAA"));
 
 #ifndef NDEBUG
-  static cl::opt<std::string>
-    CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
-               cl::desc("Only use DAG-combiner alias analysis in this"
-                        " function"));
+static cl::opt<std::string>
+CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
+                   cl::desc("Only use DAG-combiner alias analysis in this"
+                            " function"));
 #endif
 
-  /// Hidden option to stress test load slicing, i.e., when this option
-  /// is enabled, load slicing bypasses most of its profitability guards.
-  static cl::opt<bool>
-  StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
-                    cl::desc("Bypass the profitability model of load "
-                             "slicing"),
-                    cl::init(false));
+/// Hidden option to stress test load slicing, i.e., when this option
+/// is enabled, load slicing bypasses most of its profitability guards.
+static cl::opt<bool>
+StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+                  cl::desc("Bypass the profitability model of load slicing"),
+                  cl::init(false));
 
-  static cl::opt<bool>
-    MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
-                      cl::desc("DAG combiner may split indexing from loads"));
+static cl::opt<bool>
+  MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
+                    cl::desc("DAG combiner may split indexing from loads"));
 
-//------------------------------ DAGCombiner ---------------------------------//
+namespace {
 
   class DAGCombiner {
     SelectionDAG &DAG;
     const TargetLowering &TLI;
     CombineLevel Level;
     CodeGenOpt::Level OptLevel;
-    bool LegalOperations;
-    bool LegalTypes;
+    bool LegalOperations = false;
+    bool LegalTypes = false;
     bool ForCodeSize;
 
     /// \brief Worklist of all of the nodes that need to be simplified.
@@ -128,6 +158,19 @@ namespace {
     SDValue visit(SDNode *N);
 
   public:
+    DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
+        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+          OptLevel(OL), AA(AA) {
+      ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();
+
+      MaximumLegalStoreInBits = 0;
+      for (MVT VT : MVT::all_valuetypes())
+        if (EVT(VT).isSimple() && VT != MVT::Other &&
+            TLI.isTypeLegal(EVT(VT)) &&
+            VT.getSizeInBits() >= MaximumLegalStoreInBits)
+          MaximumLegalStoreInBits = VT.getSizeInBits();
+    }
+
     /// Add to the worklist making sure its instance is at the back (next to be
     /// processed.)
     void AddToWorklist(SDNode *N) {
@@ -285,7 +328,7 @@ namespace {
     SDValue visitSIGN_EXTEND(SDNode *N);
     SDValue visitZERO_EXTEND(SDNode *N);
     SDValue visitANY_EXTEND(SDNode *N);
-    SDValue visitAssertZext(SDNode *N);
+    SDValue visitAssertExt(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -348,6 +391,7 @@ namespace {
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
 
     SDValue foldSelectOfConstants(SDNode *N);
+    SDValue foldVSelectOfConstants(SDNode *N);
     SDValue foldBinOpIntoSelect(SDNode *BO);
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
     SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
@@ -371,6 +415,7 @@ namespace {
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
     SDValue CombineExtLoad(SDNode *N);
     SDValue combineRepeatedFPDivisors(SDNode *N);
+    SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -400,14 +445,11 @@ namespace {
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
-    SDValue reduceBuildVecToTrunc(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
                                   SDValue VecIn2, unsigned LeftIdx);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *N);
 
-    SDValue GetDemandedBits(SDValue V, const APInt &Mask);
-
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -434,12 +476,14 @@ namespace {
     /// Holds a pointer to an LSBaseSDNode as well as information on where it
     /// is located in a sequence of memory operations connected by a chain.
     struct MemOpLink {
-      MemOpLink(LSBaseSDNode *N, int64_t Offset)
-          : MemNode(N), OffsetFromBase(Offset) {}
       // Ptr to the mem node.
       LSBaseSDNode *MemNode;
+
       // Offset from the base ptr.
       int64_t OffsetFromBase;
+
+      MemOpLink(LSBaseSDNode *N, int64_t Offset)
+          : MemNode(N), OffsetFromBase(Offset) {}
     };
 
     /// This is a helper function for visitMUL to check the profitability
@@ -450,38 +494,49 @@ namespace {
                                      SDValue &AddNode,
                                      SDValue &ConstNode);
 
-
     /// This is a helper function for visitAND and visitZERO_EXTEND.  Returns
     /// true if the (and (load x) c) pattern matches an extload.  ExtVT returns
-    /// the type of the loaded value to be extended.  LoadedVT returns the type
-    /// of the original loaded value.  NarrowLoad returns whether the load would
-    /// need to be narrowed in order to match.
+    /// the type of the loaded value to be extended.
     bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
-                          EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
-                          bool &NarrowLoad);
+                          EVT LoadResultTy, EVT &ExtVT);
+
+    /// Helper function to calculate whether the given Load can have its
+    /// width reduced to ExtVT.
+    bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+                           EVT &ExtVT, unsigned ShAmt = 0);
+
+    /// Used by BackwardsPropagateMask to find suitable loads.
+    bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
+                           SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+                           ConstantSDNode *Mask, SDNode *&NodeToMask);
+    /// Attempt to propagate a given AND node back to load leaves so that they
+    /// can be combined into narrow loads.
+    bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
 
     /// Helper function for MergeConsecutiveStores which merges the
     /// component store chains.
     SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                 unsigned NumStores);
 
-    /// This is a helper function for MergeConsecutiveStores. When the source
-    /// elements of the consecutive stores are all constants or all extracted
-    /// vector elements, try to merge them into one larger store.
-    /// \return True if a merged store was created.
+    /// This is a helper function for MergeConsecutiveStores. When the
+    /// source elements of the consecutive stores are all constants or
+    /// all extracted vector elements, try to merge them into one
+    /// larger store introducing bitcasts if necessary.  \return True
+    /// if a merged store was created.
    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
                                          EVT MemVT, unsigned NumStores,
                                          bool IsConstantSrc, bool UseVector,
                                          bool UseTrunc);
 
-    /// This is a helper function for MergeConsecutiveStores.
-    /// Stores that may be merged are placed in StoreNodes.
+    /// This is a helper function for MergeConsecutiveStores. Stores
+    /// that potentially may be merged with St are placed in
+    /// StoreNodes.
     void getStoreMergeCandidates(StoreSDNode *St,
                                  SmallVectorImpl<MemOpLink> &StoreNodes);
 
     /// Helper function for MergeConsecutiveStores. Checks if
-    /// Candidate stores have indirect dependency through their
-    /// operands. \return True if safe to merge
+    /// candidate stores have indirect dependency through their
+    /// operands. \return True if safe to merge.
     bool checkMergeStoreCandidatesForDependencies(
         SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
 
@@ -500,19 +555,6 @@ namespace {
     SDValue distributeTruncateThroughAnd(SDNode *N);
 
   public:
-    DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
-        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
-          OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
-      ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
-
-      MaximumLegalStoreInBits = 0;
-      for (MVT VT : MVT::all_valuetypes())
-        if (EVT(VT).isSimple() && VT != MVT::Other &&
-            TLI.isTypeLegal(EVT(VT)) &&
-            VT.getSizeInBits() >= MaximumLegalStoreInBits)
-          MaximumLegalStoreInBits = VT.getSizeInBits();
-    }
-
     /// Runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
 
@@ -541,14 +583,12 @@ namespace {
       return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
     }
   };
-}
-
-namespace {
 
 /// This class is a DAGUpdateListener that removes any deleted
 /// nodes from the worklist.
 class WorklistRemover : public SelectionDAG::DAGUpdateListener {
   DAGCombiner &DC;
+
 public:
   explicit WorklistRemover(DAGCombiner &dc)
     : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
@@ -557,7 +597,8 @@ public:
     DC.removeFromWorklist(N);
   }
 };
-}
+
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 //  TargetLowering::DAGCombinerInfo implementation
@@ -577,7 +618,6 @@ CombineTo(SDNode *N, SDValue Res, bool AddTo) {
   return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
 }
 
-
 SDValue TargetLowering::DAGCombinerInfo::
 CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
   return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
@@ -873,6 +913,56 @@ static bool isAnyConstantBuildVector(const SDNode *N) {
          ISD::isBuildVectorOfConstantFPSDNodes(N);
 }
 
+// Attempt to match a unary predicate against a scalar/splat constant or
+// every element of a constant BUILD_VECTOR.
+static bool matchUnaryPredicate(SDValue Op,
+                                std::function<bool(ConstantSDNode *)> Match) {
+  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
+    return Match(Cst);
+
+  if (ISD::BUILD_VECTOR != Op.getOpcode())
+    return false;
+
+  EVT SVT = Op.getValueType().getScalarType();
+  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+    auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
+    if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
+      return false;
+  }
+  return true;
+}
+
+// Attempt to match a binary predicate against a pair of scalar/splat constants
+// or every element of a pair of constant BUILD_VECTORs.
+static bool matchBinaryPredicate(
+    SDValue LHS, SDValue RHS,
+    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
+  if (LHS.getValueType() != RHS.getValueType())
+    return false;
+
+  if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
+    if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
+      return Match(LHSCst, RHSCst);
+
+  if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
+      ISD::BUILD_VECTOR != RHS.getOpcode())
+    return false;
+
+  EVT SVT = LHS.getValueType().getScalarType();
+  for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+    auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
+    auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
+    if (!LHSCst || !RHSCst)
+      return false;
+    if (LHSCst->getValueType(0) != SVT ||
+        LHSCst->getValueType(0) != RHSCst->getValueType(0))
+      return false;
+    if (!Match(LHSCst, RHSCst))
+      return false;
+  }
+  return true;
+}
+
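These two helpers let later folds test constant operands uniformly, whether the operand is a scalar constant, a splat, or a non-splat constant BUILD_VECTOR. As a rough standalone sketch of the contract (ordinary host integers standing in for ConstantSDNode, and a hypothetical matchUnary helper of our own, not LLVM's API):

    #include <cstdint>
    #include <functional>
    #include <initializer_list>
    #include <iostream>
    #include <vector>

    // Model: a "constant operand" is one value per vector element (a scalar is
    // a one-element vector). The match succeeds only if the predicate holds
    // for every element, mirroring matchUnaryPredicate over a BUILD_VECTOR.
    static bool matchUnary(const std::vector<uint64_t> &Elts,
                           std::function<bool(uint64_t)> Match) {
      for (uint64_t E : Elts)
        if (!Match(E))
          return false;
      return !Elts.empty();
    }

    int main() {
      unsigned BitWidth = 32;
      // A non-splat shift-amount vector <40, 33>: every element is >= 32, so a
      // fold like "shl x, c >= size(x) -> undef" can fire without a splat.
      std::vector<uint64_t> Amts = {40, 33};
      bool TooBig = matchUnary(Amts, [=](uint64_t C) { return C >= BitWidth; });
      std::cout << (TooBig ? "all out of range\n" : "in range\n");
    }
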
 SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                     SDValue N1) {
   EVT VT = N0.getValueType();
@@ -1123,10 +1213,10 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
     Replace0 &= !N0->hasOneUse();
     Replace1 &= (N0 != N1) && !N1->hasOneUse();
 
-    // Combine Op here so it is presreved past replacements.
+    // Combine Op here so it is preserved past replacements.
     CombineTo(Op.getNode(), RV);
 
-    // If operands have a use ordering, make sur we deal with
+    // If operands have a use ordering, make sure we deal with
     // predecessor first.
     if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
       std::swap(N0, N1);
@@ -1473,7 +1563,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
   case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
-  case ISD::AssertZext:         return visitAssertZext(N);
+  case ISD::AssertSext:
+  case ISD::AssertZext:         return visitAssertExt(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
   case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1572,15 +1663,15 @@ SDValue DAGCombiner::combine(SDNode *N) {
     }
   }
 
-  // If N is a commutative binary node, try commuting it to enable more
-  // sdisel CSE.
+  // If N is a commutative binary node, try to eliminate it if the commuted
+  // version is already present in the DAG.
   if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
       N->getNumValues() == 1) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
 
     // Constant operands are canonicalized to RHS.
-    if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+    if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
       SDValue Ops[] = {N1, N0};
       SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
                                             N->getFlags());
@@ -1632,7 +1723,6 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
 
     // Check each of the operands.
     for (const SDValue &Op : TF->op_values()) {
-
       switch (Op.getOpcode()) {
       case ISD::EntryToken:
         // Entry tokens don't need to be added to the list.  They are
@@ -1907,6 +1997,15 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
         return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
       }
     }
+
+    // Undo the add -> or combine to merge constant offsets from a frame index.
+    if (N0.getOpcode() == ISD::OR &&
+        isa<FrameIndexSDNode>(N0.getOperand(0)) &&
+        isa<ConstantSDNode>(N0.getOperand(1)) &&
+        DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
+      SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
+      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
+    }
   }
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -2064,7 +2163,8 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference)
   }
 
   // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
-  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)))
+  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
+      N1.getResNo() == 0)
     return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
                        N0, N1.getOperand(0), N1.getOperand(2));
 
@@ -2537,6 +2637,12 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
     N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
+    assert((!N0IsConst ||
+            ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
+           "Splat APInt should be element width");
+    assert((!N1IsConst ||
+            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
+           "Splat APInt should be element width");
   } else {
     N0IsConst = isa<ConstantSDNode>(N0);
     if (N0IsConst) {
@@ -2562,12 +2668,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // fold (mul x, 0) -> 0
   if (N1IsConst && ConstValue1.isNullValue())
     return N1;
-  // We require a splat of the entire scalar bit width for non-contiguous
-  // bit patterns.
-  bool IsFullSplat =
-    ConstValue1.getBitWidth() == VT.getScalarSizeInBits();
   // fold (mul x, 1) -> x
-  if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat)
+  if (N1IsConst && ConstValue1.isOneValue())
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -2580,16 +2682,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
                        DAG.getConstant(0, DL, VT), N0);
   }
   // fold (mul x, (1 << c)) -> x << c
-  if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isPowerOf2() &&
-      IsFullSplat) {
+  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+      DAG.isKnownToBeAPowerOfTwo(N1) &&
+      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
     SDLoc DL(N);
-    return DAG.getNode(ISD::SHL, DL, VT, N0,
-                       DAG.getConstant(ConstValue1.logBase2(), DL,
-                                       getShiftAmountTy(N0.getValueType())));
+    SDValue LogBase2 = BuildLogBase2(N1, DL);
+    AddToWorklist(LogBase2.getNode());
+
+    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
+    AddToWorklist(Trunc.getNode());
+    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
   }
   // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
-  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2() &&
-      IsFullSplat) {
+  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
    unsigned Log2Val = (-ConstValue1).logBase2();
    SDLoc DL(N);
    // FIXME: If the input is something that is easily negated (e.g. a
@@ -2835,7 +2941,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
   // If integer divide is expensive and we satisfy the requirements, emit an
   // alternate sequence.  Targets may check function attributes for size/speed
   // trade-offs.
-  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildSDIV(N))
       return Op;
@@ -2906,7 +3012,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
   }
 
   // fold (udiv x, c) -> alternate
-  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildUDIV(N))
       return Op;
@@ -2965,7 +3071,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
     }
   }
 
-  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
@@ -3003,19 +3109,26 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
+  if (VT.isVector()) {
+    // fold (mulhs x, 0) -> 0
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N1;
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N0;
+  }
+
   // fold (mulhs x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
   // fold (mulhs x, 1) -> (sra x, size(x)-1)
-  if (isOneConstant(N1)) {
-    SDLoc DL(N);
+  if (isOneConstant(N1))
     return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
                        DAG.getConstant(N0.getValueSizeInBits() - 1, DL,
                                        getShiftAmountTy(N0.getValueType())));
-  }
+
   // fold (mulhs x, undef) -> 0
   if (N0.isUndef() || N1.isUndef())
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
 
   // If the type twice as wide is legal, transform the mulhs to a wider multiply
   // plus a shift.
@@ -3043,6 +3156,14 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
+  if (VT.isVector()) {
+    // fold (mulhu x, 0) -> 0
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N1;
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N0;
+  }
+
   // fold (mulhu x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
@@ -3216,7 +3337,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
-  // fold (add c1, c2) -> c1+c2
+  // fold operation with constant operands.
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
   if (N0C && N1C)
@@ -3599,22 +3720,20 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
 }
 
 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
-                                   EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
-                                   bool &NarrowLoad) {
-  uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
-
-  if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))
+                                   EVT LoadResultTy, EVT &ExtVT) {
+  if (!AndC->getAPIntValue().isMask())
     return false;
 
+  unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+
   ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
-  LoadedVT = LoadN->getMemoryVT();
+  EVT LoadedVT = LoadN->getMemoryVT();
 
   if (ExtVT == LoadedVT &&
       (!LegalOperations ||
        TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
     // ZEXTLOAD will match without needing to change the size of the value being
     // loaded.
-    NarrowLoad = false;
     return true;
   }
@@ -3634,10 +3753,185 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
   if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
     return false;
 
-  NarrowLoad = true;
   return true;
 }
 
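The rewritten isAndLoadExtLoad keys entirely off the AND constant being a low-bit mask: isMask() accepts values of the form 2^k - 1, and countTrailingOnes() recovers k, which becomes the width of the zero-extending load. A small self-contained C++ illustration of that arithmetic (our own isMask/countTrailingOnes stand-ins, not the APInt methods):

    #include <cassert>
    #include <cstdint>

    // isMask: value is 2^k - 1, i.e. a contiguous run of low one bits.
    static bool isMask(uint64_t V) { return V != 0 && ((V & (V + 1)) == 0); }

    static unsigned countTrailingOnes(uint64_t V) {
      unsigned N = 0;
      while (V & 1) { V >>= 1; ++N; }
      return N;
    }

    int main() {
      // (and (load x), 0xFF) keeps only the low 8 bits, which is exactly what
      // a zero-extending 8-bit load produces; hence ExtVT becomes i8.
      uint64_t AndC = 0xFF;
      assert(isMask(AndC) && countTrailingOnes(AndC) == 8);
      uint32_t Loaded = 0xDEADBEEF;
      assert((Loaded & AndC) == uint32_t(uint8_t(Loaded)));
      return 0;
    }
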
+bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+                                    EVT &ExtVT, unsigned ShAmt) {
+  // Don't transform one with multiple uses, this would require adding a new
+  // load.
+  if (!SDValue(LoadN, 0).hasOneUse())
+    return false;
+
+  if (LegalOperations &&
+      !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
+    return false;
+
+  // Do not generate loads of non-round integer types since these can
+  // be expensive (and would be wrong if the type is not byte sized).
+  if (!ExtVT.isRound())
+    return false;
+
+  // Don't change the width of a volatile load.
+  if (LoadN->isVolatile())
+    return false;
+
+  // Verify that we are actually reducing a load width here.
+  if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
+    return false;
+
+  // For the transform to be legal, the load must produce only two values
+  // (the value loaded and the chain).  Don't transform a pre-increment
+  // load, for example, which produces an extra value.  Otherwise the
+  // transformation is not equivalent, and the downstream logic to replace
+  // uses gets things wrong.
+  if (LoadN->getNumValues() > 2)
+    return false;
+
+  // If the load that we're shrinking is an extload and we're not just
+  // discarding the extension we can't simply shrink the load. Bail.
+  // TODO: It would be possible to merge the extensions in some cases.
+  if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
+      LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
+    return false;
+
+  if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
+    return false;
+
+  // It's not possible to generate a constant of extended or untyped type.
+  EVT PtrType = LoadN->getOperand(1).getValueType();
+  if (PtrType == MVT::Untyped || PtrType.isExtended())
+    return false;
+
+  return true;
+}
+
+bool DAGCombiner::SearchForAndLoads(SDNode *N,
+                                    SmallPtrSetImpl<LoadSDNode*> &Loads,
+                                    SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+                                    ConstantSDNode *Mask,
+                                    SDNode *&NodeToMask) {
+  // Recursively search for the operands, looking for loads which can be
+  // narrowed.
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
+    SDValue Op = N->getOperand(i);
+
+    if (Op.getValueType().isVector())
+      return false;
+
+    // Some constants may need fixing up later if they are too large.
+    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+      if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
+          (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
+        NodesWithConsts.insert(N);
+      continue;
+    }
+
+    if (!Op.hasOneUse())
+      return false;
+
+    switch(Op.getOpcode()) {
+    case ISD::LOAD: {
+      auto *Load = cast<LoadSDNode>(Op);
+      EVT ExtVT;
+      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
+          isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
+        // Only add this load if we can make it more narrow.
+        if (ExtVT.bitsLT(Load->getMemoryVT()))
+          Loads.insert(Load);
+        continue;
+      }
+      return false;
+    }
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+    case ISD::AssertZext: {
+      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+      EVT VT = Op.getOpcode() == ISD::AssertZext ?
+        cast<VTSDNode>(Op.getOperand(1))->getVT() :
+        Op.getOperand(0).getValueType();
+
+      // We can accept extending nodes if the mask is wider or an equal
+      // width to the original type.
+      if (ExtVT.bitsGE(VT))
+        continue;
+      break;
+    }
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::AND:
+      if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
+                             NodeToMask))
+        return false;
+      continue;
+    }
+
+    // Allow one node which will be masked along with any loads found.
+    if (NodeToMask)
+      return false;
+    NodeToMask = Op.getNode();
+  }
+  return true;
+}
+
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
+  auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Mask)
+    return false;
+
+  if (!Mask->getAPIntValue().isMask())
+    return false;
+
+  // No need to do anything if the and directly uses a load.
+  if (isa<LoadSDNode>(N->getOperand(0)))
+    return false;
+
+  SmallPtrSet<LoadSDNode*, 8> Loads;
+  SmallPtrSet<SDNode*, 2> NodesWithConsts;
+  SDNode *FixupNode = nullptr;
+  if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
+    if (Loads.size() == 0)
+      return false;
+
+    SDValue MaskOp = N->getOperand(1);
+
+    // If it exists, fixup the single node we allow in the tree that needs
+    // masking.
+    if (FixupNode) {
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
+                                FixupNode->getValueType(0),
+                                SDValue(FixupNode, 0), MaskOp);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
+      DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0),
+                             MaskOp);
+    }
+
+    // Narrow any constants that need it.
+    for (auto *LogicN : NodesWithConsts) {
+      auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
+                                SDValue(C, 0), MaskOp);
+      DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
+    }
+
+    // Create narrow loads.
+    for (auto *Load : Loads) {
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
+                                SDValue(Load, 0), MaskOp);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
+      DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp);
+      SDValue NewLoad = ReduceLoadWidth(And.getNode());
+      assert(NewLoad &&
             "Shouldn't be masking the load if it can't be narrowed");
+      CombineTo(Load, NewLoad, NewLoad.getValue(1));
+    }
+    DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+    return true;
+  }
+  return false;
+}
+
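The legality of pushing the mask down through OR/XOR/AND trees rests on AND distributing over those operators; once every leaf is masked, the load leaves can become narrow zero-extending loads and the root AND is redundant. A minimal standalone C++ check of that algebra (host integers only, independent of the SelectionDAG code above):

    #include <cassert>
    #include <cstdint>

    int main() {
      // AND distributes over OR and XOR:
      //   (a | b) & m == (a & m) | (b & m), and likewise for ^.
      uint32_t a = 0x11223344, b = 0xAABBCCDD, m = 0xFF;
      assert(((a | b) & m) == ((a & m) | (b & m)));
      assert(((a ^ b) & m) == ((a & m) ^ (b & m)));
      // A masked leaf is equivalent to a narrow zero-extended load of the
      // same value, so each 32-bit load can become an 8-bit zextload here.
      assert((a & m) == uint32_t(uint8_t(a)));
      return 0;
    }
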
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -3829,55 +4123,23 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
                                 (N0.getOpcode() == ISD::ANY_EXTEND &&
                                  N0.getOperand(0).getOpcode() == ISD::LOAD))) {
-    bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
-    LoadSDNode *LN0 = HasAnyExt
-      ? cast<LoadSDNode>(N0.getOperand(0))
-      : cast<LoadSDNode>(N0);
-    if (LN0->getExtensionType() != ISD::SEXTLOAD &&
-        LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) {
-      auto NarrowLoad = false;
-      EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
-      EVT ExtVT, LoadedVT;
-      if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT,
-                           NarrowLoad)) {
-        if (!NarrowLoad) {
-          SDValue NewLoad =
-            DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
-                           LN0->getChain(), LN0->getBasePtr(), ExtVT,
-                           LN0->getMemOperand());
-          AddToWorklist(N);
-          CombineTo(LN0, NewLoad, NewLoad.getValue(1));
-          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
-        } else {
-          EVT PtrType = LN0->getOperand(1).getValueType();
-
-          unsigned Alignment = LN0->getAlignment();
-          SDValue NewPtr = LN0->getBasePtr();
-
-          // For big endian targets, we need to add an offset to the pointer
-          // to load the correct bytes.  For little endian systems, we merely
-          // need to read fewer bytes from the same pointer.
-          if (DAG.getDataLayout().isBigEndian()) {
-            unsigned LVTStoreBytes = LoadedVT.getStoreSize();
-            unsigned EVTStoreBytes = ExtVT.getStoreSize();
-            unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
-            SDLoc DL(LN0);
-            NewPtr = DAG.getNode(ISD::ADD, DL, PtrType,
-                                 NewPtr, DAG.getConstant(PtrOff, DL, PtrType));
-            Alignment = MinAlign(Alignment, PtrOff);
-          }
+    if (SDValue Res = ReduceLoadWidth(N)) {
+      LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
+        ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
 
-          AddToWorklist(NewPtr.getNode());
+      AddToWorklist(N);
+      CombineTo(LN0, Res, Res.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
 
-          SDValue Load = DAG.getExtLoad(
-              ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr,
-              LN0->getPointerInfo(), ExtVT, Alignment,
-              LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
-          AddToWorklist(N);
-          CombineTo(LN0, Load, Load.getValue(1));
-          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
-        }
-      }
+  if (Level >= AfterLegalizeTypes) {
+    // Attempt to propagate the AND back up to the leaves which, if they're
+    // loads, can be combined to narrow loads and the AND node can be removed.
+    // Perform after legalization so that extend nodes will already be
+    // combined into the loads.
+    if (BackwardsPropagateMask(N, DAG)) {
+      return SDValue(N, 0);
     }
   }
@@ -3974,7 +4236,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
     return SDValue();
 
-  // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00)
+  // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
   bool LookPassAnd0 = false;
   bool LookPassAnd1 = false;
   if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
@@ -4593,20 +4855,6 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
   return nullptr;
 }
 
-// if Left + Right == Sum (constant or constant splat vector)
-static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum,
-                             SelectionDAG &DAG, const SDLoc &DL) {
-  EVT ShiftVT = Left.getValueType();
-  if (ShiftVT != Right.getValueType()) return false;
-
-  SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT,
-                         Left.getNode(), Right.getNode());
-  if (!ShiftSum) return false;
-
-  ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum);
-  return CSum && CSum->getZExtValue() == Sum;
-}
-
 // MatchRotate - Handle an 'or' of two operands.  If this is one of the many
 // idioms for rotate, and if the target supports rotation instructions, generate
 // a rot[lr].
@@ -4620,6 +4868,16 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
   bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
   if (!HasROTL && !HasROTR) return nullptr;
 
+  // Check for truncated rotate.
+  if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
+      LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
+    assert(LHS.getValueType() == RHS.getValueType());
+    if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
+                         SDValue(Rot, 0)).getNode();
+    }
+  }
+
   // Match "(X shl/srl V1) & V2" where V2 may not be present.
   SDValue LHSShift;   // The shift.
   SDValue LHSMask;    // AND value if any.
@@ -4652,7 +4910,11 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
 
   // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
   // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
-  if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) {
+  auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
+                                        ConstantSDNode *RHS) {
+    return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
+  };
+  if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
     SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
                               LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
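The new MatchRotateSum predicate accepts any pair of constant shift amounts whose sum equals the element width (now per-element for vectors, via matchBinaryPredicate). A self-contained C++ check of the underlying identity (host integers standing in for SDValues; rotl32 is our own helper, not an LLVM API):

    #include <cassert>
    #include <cstdint>

    static uint32_t rotl32(uint32_t X, unsigned C) {
      C &= 31;
      return C ? (X << C) | (X >> (32 - C)) : X;
    }

    int main() {
      // (or (shl x, C1), (srl x, C2)) with C1 + C2 == 32 is rotl(x, C1),
      // equivalently rotr(x, C2).
      uint32_t X = 0x80C0F00D;
      for (unsigned C1 = 1; C1 < 32; ++C1) {
        unsigned C2 = 32 - C1;
        assert(((X << C1) | (X >> C2)) == rotl32(X, C1));
      }
      return 0;
    }
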
@@ -4712,20 +4974,22 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
 }
 
 namespace {
+
 /// Represents known origin of an individual byte in load combine pattern. The
 /// value of the byte is either constant zero or comes from memory.
 struct ByteProvider {
   // For constant zero providers Load is set to nullptr. For memory providers
   // Load represents the node which loads the byte from memory.
   // ByteOffset is the offset of the byte in the value produced by the load.
-  LoadSDNode *Load;
-  unsigned ByteOffset;
+  LoadSDNode *Load = nullptr;
+  unsigned ByteOffset = 0;
 
-  ByteProvider() : Load(nullptr), ByteOffset(0) {}
+  ByteProvider() = default;
 
   static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
     return ByteProvider(Load, ByteOffset);
   }
+
   static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
 
   bool isConstantZero() const { return !Load; }
@@ -4740,6 +5004,8 @@ private:
       : Load(Load), ByteOffset(ByteOffset) {}
 };
 
+} // end anonymous namespace
+
 /// Recursively traverses the expression calculating the origin of the requested
 /// byte of the given value. Returns None if the provider can't be calculated.
 ///
@@ -4751,9 +5017,9 @@ private:
 /// Because the parts of the expression are not allowed to have more than one
 /// use this function iterates over trees, not DAGs. So it never visits the same
 /// node more than once.
-const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
-                                                   unsigned Depth,
-                                                   bool Root = false) {
+static const Optional<ByteProvider>
+calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
+                      bool Root = false) {
   // Typical i64 by i8 pattern requires recursion up to 8 calls depth
   if (Depth == 10)
     return None;
@@ -4837,7 +5103,6 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
   return None;
 }
 
-} // namespace
 
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors.  Fold it into a single load or a load
@@ -4950,7 +5215,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
     Loads.insert(L);
   }
 
-  assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
+  assert(!Loads.empty() && "All the bytes of the value must be loaded from "
          "memory, so there must be at least one load which produces the value");
   assert(Base && "Base address of the accessed memory location must be set");
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
@@ -5373,7 +5638,11 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   if (isNullConstantOrNullSplatConstant(N0))
     return N0;
   // fold (shl x, c >= size(x)) -> undef
-  if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+    return Val->getAPIntValue().uge(OpSizeInBits);
+  };
+  if (matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (shl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
@@ -5400,20 +5669,29 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     return SDValue(N, 0);
 
   // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
-  if (N1C && N0.getOpcode() == ISD::SHL) {
-    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
-      SDLoc DL(N);
-      APInt c1 = N0C1->getAPIntValue();
-      APInt c2 = N1C->getAPIntValue();
+  if (N0.getOpcode() == ISD::SHL) {
+    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                          ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).uge(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+      return DAG.getConstant(0, SDLoc(N), VT);
 
-      APInt Sum = c1 + c2;
-      if (Sum.uge(OpSizeInBits))
-        return DAG.getConstant(0, DL, VT);
-
-      return DAG.getNode(
-          ISD::SHL, DL, VT, N0.getOperand(0),
-          DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                       ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
+      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).ult(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+      SDLoc DL(N);
+      EVT ShiftVT = N1.getValueType();
+      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+      return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
     }
   }
 
@@ -5527,16 +5805,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   }
 
   // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
   // Variant of version done on multiply, except mul by a power of 2 is turned
   // into a shift.
-  if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+  if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
+      N0.getNode()->hasOneUse() &&
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
     SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
     SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
     AddToWorklist(Shl0.getNode());
     AddToWorklist(Shl1.getNode());
-    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
   }
 
   // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
@@ -5579,7 +5859,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
   // fold (sra x, c >= size(x)) -> undef
-  if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+    return Val->getAPIntValue().uge(OpSizeInBits);
+  };
+  if (matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (sra x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
@@ -5603,20 +5887,31 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   }
 
   // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
-  if (N1C && N0.getOpcode() == ISD::SRA) {
-    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
-      SDLoc DL(N);
-      APInt c1 = N0C1->getAPIntValue();
-      APInt c2 = N1C->getAPIntValue();
-      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
-
-      APInt Sum = c1 + c2;
-      if (Sum.uge(OpSizeInBits))
-        Sum = APInt(OpSizeInBits, OpSizeInBits - 1);
+  if (N0.getOpcode() == ISD::SRA) {
+    SDLoc DL(N);
+    EVT ShiftVT = N1.getValueType();
 
-      return DAG.getNode(
-          ISD::SRA, DL, VT, N0.getOperand(0),
-          DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                          ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
+      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).uge(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
+                         DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));
+
+    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                       ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
+      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).ult(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+      return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
     }
   }
 
@@ -5647,7 +5942,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
           TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
           TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
          TLI.isTruncateFree(VT, TruncVT)) {
-          SDLoc DL(N);
         SDValue Amt = DAG.getConstant(ShiftAmt, DL,
             getShiftAmountTy(N0.getOperand(0).getValueType()));
@@ -5697,7 +5991,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
-
   // If the sign bit is known to be zero, switch this to a SRL.
   if (DAG.SignBitIsZero(N0))
     return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
@@ -5730,7 +6023,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (isNullConstantOrNullSplatConstant(N0))
     return N0;
   // fold (srl x, c >= size(x)) -> undef
-  if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+    return Val->getAPIntValue().uge(OpSizeInBits);
+  };
+  if (matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (srl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
@@ -5745,20 +6042,29 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     return DAG.getConstant(0, SDLoc(N), VT);
 
   // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
-  if (N1C && N0.getOpcode() == ISD::SRL) {
-    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
-      SDLoc DL(N);
-      APInt c1 = N0C1->getAPIntValue();
-      APInt c2 = N1C->getAPIntValue();
+  if (N0.getOpcode() == ISD::SRL) {
+    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                          ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).uge(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+      return DAG.getConstant(0, SDLoc(N), VT);
 
-      APInt Sum = c1 + c2;
-      if (Sum.uge(OpSizeInBits))
-        return DAG.getConstant(0, DL, VT);
-
-      return DAG.getNode(
-          ISD::SRL, DL, VT, N0.getOperand(0),
-          DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+                                       ConstantSDNode *RHS) {
+      APInt c1 = LHS->getAPIntValue();
+      APInt c2 = RHS->getAPIntValue();
+      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+      return (c1 + c2).ult(OpSizeInBits);
+    };
+    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+      SDLoc DL(N);
+      EVT ShiftVT = N1.getValueType();
+      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
    }
  }
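The SHL/SRA/SRL shift-of-shift folds above all use the same sum rule: in range, the two amounts add; out of range, SHL/SRL collapse to 0 while SRA clamps to width minus one so the sign bit keeps replicating. A standalone C++ check of those behaviors for 32-bit values (note that >> on a negative signed value is the usual arithmetic shift on mainstream targets):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xF00DCAFE;
      // In range: two logical right shifts combine into one summed shift.
      assert(((X >> 9) >> 6) == (X >> 15));
      // Out of range: a total logical shift of >= 32 yields zero, which is
      // why the combiner folds that case to constant 0.
      assert(((X >> 20) >> 15) == 0);
      // SRA instead clamps: shifting a sign bit "past the end" still leaves
      // all sign bits, the same as shifting by width - 1.
      int32_t S = -12345;
      assert(((S >> 20) >> 15) == (S >> 31) && (S >> 31) == -1);
      return 0;
    }
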
@@ -6008,7 +6314,6 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   return SDValue();
 }
 
-
 /// \brief Generate Min/Max node
 static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                    SDValue RHS, SDValue True, SDValue False,
@@ -6096,7 +6401,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
     // For any constants that differ by 1, we can transform the select into an
     // extend and add. Use a target hook because some targets may prefer to
     // transform in the other direction.
-    if (TLI.convertSelectOfConstantsToMath()) {
+    if (TLI.convertSelectOfConstantsToMath(VT)) {
       if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
         // select Cond, C1, C1-1 --> add (zext Cond), C1-1
         if (VT != MVT::i1)
@@ -6371,7 +6676,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
 }
 
 SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
-
   if (Level >= AfterLegalizeTypes)
     return SDValue();
@@ -6432,7 +6736,6 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
 }
 
 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
-
   if (Level >= AfterLegalizeTypes)
     return SDValue();
@@ -6447,7 +6750,6 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   // prevents the type legalizer from unrolling SETCC into scalar comparisons
   // and enables future optimizations (e.g. min/max pattern matching on X86).
   if (Mask.getOpcode() == ISD::SETCC) {
-
     // Check if any splitting is required.
     if (TLI.getTypeAction(*DAG.getContext(), VT) !=
         TargetLowering::TypeSplitVector)
@@ -6504,11 +6806,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
 }
 
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
-
   if (Level >= AfterLegalizeTypes)
     return SDValue();
 
-  MaskedGatherSDNode *MGT = dyn_cast<MaskedGatherSDNode>(N);
+  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
   SDValue Mask = MGT->getMask();
   SDLoc DL(N);
@@ -6581,7 +6882,6 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
 }
 
 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
-
   if (Level >= AfterLegalizeTypes)
     return SDValue();
@@ -6593,7 +6893,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
   // SETCC, then split both nodes and its operands before legalization. This
   // prevents the type legalizer from unrolling SETCC into scalar comparisons
   // and enables future optimizations (e.g. min/max pattern matching on X86).
-
   if (Mask.getOpcode() == ISD::SETCC) {
     EVT VT = N->getValueType(0);
@@ -6665,6 +6964,57 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
   return SDValue();
 }
 
+/// A vector select of 2 constant vectors can be simplified to math/logic to
+/// avoid a variable select instruction and possibly avoid constant loads.
+SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
+  SDValue Cond = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
+      !TLI.convertSelectOfConstantsToMath(VT) ||
+      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
+      !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
+    return SDValue();
+
+  // Check if we can use the condition value to increment/decrement a single
+  // constant value. This simplifies a select to an add and removes a constant
+  // load/materialization from the general case.
+  bool AllAddOne = true;
+  bool AllSubOne = true;
+  unsigned Elts = VT.getVectorNumElements();
+  for (unsigned i = 0; i != Elts; ++i) {
+    SDValue N1Elt = N1.getOperand(i);
+    SDValue N2Elt = N2.getOperand(i);
+    if (N1Elt.isUndef() || N2Elt.isUndef())
+      continue;
+
+    const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
+    const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
+    if (C1 != C2 + 1)
+      AllAddOne = false;
+    if (C1 != C2 - 1)
+      AllSubOne = false;
+  }
+
+  // Further simplifications for the extra-special cases where the constants
+  // are all 0 or all -1 should be implemented as folds of these patterns.
+  SDLoc DL(N);
+  if (AllAddOne || AllSubOne) {
+    // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
+    // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
+    auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+    SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
+    return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
+  }
+
+  // The general case for select-of-constants:
+  // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
+  // ...but that only makes sense if a vselect is slower than 2 logic ops, so
+  // leave that to a machine-specific pass.
+  return SDValue();
+}
+
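The per-lane identity this fold relies on is easy to check directly. A self-contained C++ snippet for one lane (plain host integers; an i1 condition zero-extends to 0 or 1 and sign-extends to 0 or -1):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      // select cond, C+1, C == zext(cond) + C
      // select cond, C-1, C == sext(cond) + C
      for (int cond : {0, 1}) {
        int32_t C = 41;
        int32_t Zext = cond;  // 0 or 1
        int32_t Sext = -cond; // 0 or -1
        assert((cond ? C + 1 : C) == Zext + C);
        assert((cond ? C - 1 : C) == Sext + C);
      }
      return 0;
    }
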
-    } -  } -    // fold (zext (truncate x)) -> (and x, mask)    if (N0.getOpcode() == ISD::TRUNCATE) {      // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -7445,7 +7791,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {      if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {        SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);        AddToWorklist(Op.getNode()); -      return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); +      SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); +      // We may safely transfer the debug info describing the truncate node over +      // to the equivalent and operation. +      DAG.transferDbgValues(N0, And); +      return And;      }    } @@ -7522,11 +7872,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {        if (!N0.hasOneUse()) {          if (N0.getOpcode() == ISD::AND) {            auto *AndC = cast<ConstantSDNode>(N0.getOperand(1)); -          auto NarrowLoad = false;            EVT LoadResultTy = AndC->getValueType(0); -          EVT ExtVT, LoadedVT; -          if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT, -                               NarrowLoad)) +          EVT ExtVT; +          if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT))              DoXform = false;          }          if (DoXform) @@ -7547,8 +7895,15 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {                                      SDLoc(N0.getOperand(0)),                                      N0.getOperand(0).getValueType(), ExtLoad);          ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); +        bool NoReplaceTruncAnd = !N0.hasOneUse();          bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();          CombineTo(N, And); +        // If N0 has multiple uses, change other uses as well. +        if (NoReplaceTruncAnd) { +          SDValue TruncAnd = +              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); +          CombineTo(N0.getNode(), TruncAnd); +        }          if (NoReplaceTrunc)            DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));          else @@ -7604,10 +7959,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {        // If the desired elements are smaller or larger than the source        // elements we can use a matching integer vector type and then        // truncate/sign extend. -      EVT MatchingElementType = EVT::getIntegerVT( -          *DAG.getContext(), N00VT.getScalarSizeInBits()); -      EVT MatchingVectorType = EVT::getVectorVT( -          *DAG.getContext(), MatchingElementType, N00VT.getVectorNumElements()); +      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();        SDValue VsetCC =            DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),                        N0.getOperand(1), N0.getOperand(2)); @@ -7731,7 +8083,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {                        ISD::ANY_EXTEND);        // If the load value is used only by N, replace it via CombineTo N.        bool NoReplaceTrunc = N0.hasOneUse(); -      CombineTo(N, ExtLoad);  +      CombineTo(N, ExtLoad);        if (NoReplaceTrunc)          DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));        else @@ -7769,13 +8121,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {      // aext(setcc) -> aext(vsetcc)      // Only do this before legalize for now.      
 if (VT.isVector() && !LegalOperations) {
-      EVT N0VT = N0.getOperand(0).getValueType();
-        // We know that the # elements of the results is the same as the
-        // # elements of the compare (and the # elements of the compare result
-        // for that matter).  Check to see that they are the same size.  If so,
-        // we know that the element size of the sext'd result matches the
-        // element size of the compare operands.
-      if (VT.getSizeInBits() == N0VT.getSizeInBits())
+      EVT N00VT = N0.getOperand(0).getValueType();
+      if (getSetCCResultType(N00VT) == N0.getValueType())
+        return SDValue();
+
+      // We know that the # elements of the result is the same as the
+      // # elements of the compare (and the # elements of the compare result
+      // for that matter).  Check to see that they are the same size.  If so,
+      // we know that the element size of the sext'd result matches the
+      // element size of the compare operands.
+      if (VT.getSizeInBits() == N00VT.getSizeInBits())
         return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
@@ -7783,7 +8138,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       // elements we can use a matching integer vector type and then
       // truncate/any extend
       else {
-        EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
+        EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
         SDValue VsetCC =
           DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                         N0.getOperand(1),
@@ -7804,77 +8159,47 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   return SDValue();
 }

-SDValue DAGCombiner::visitAssertZext(SDNode *N) {
+SDValue DAGCombiner::visitAssertExt(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  EVT EVT = cast<VTSDNode>(N1)->getVT();
+  EVT AssertVT = cast<VTSDNode>(N1)->getVT();

-  // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt)
-  if (N0.getOpcode() == ISD::AssertZext &&
-      EVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
+  // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
+  if (N0.getOpcode() == Opcode &&
+      AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
     return N0;

-  return SDValue();
-}
+  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == Opcode) {
+    // We have an assert, truncate, assert sandwich. Make one stronger assert
+    // by asserting the smaller asserted type on the larger source type. This
+    // eliminates the later assert:
+    // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
+    // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
+    SDValue BigA = N0.getOperand(0);
+    EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+    assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
+           "Asserting zero/sign-extended bits to a type larger than the "
+           "truncated destination does not provide information");

-/// See if the specified operand can be simplified with the knowledge that only
-/// the bits specified by Mask are used.  If so, return the simpler operand,
-/// otherwise return a null SDValue.
-///
-/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
-/// simplify nodes with multiple uses more aggressively.)
-SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
-  switch (V.getOpcode()) {
-  default: break;
-  case ISD::Constant: {
-    const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
-    assert(CV && "Const value should be ConstSDNode.");
-    const APInt &CVal = CV->getAPIntValue();
-    APInt NewVal = CVal & Mask;
-    if (NewVal != CVal)
-      return DAG.getConstant(NewVal, SDLoc(V), V.getValueType());
-    break;
+    SDLoc DL(N);
+    EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
+    SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
+    SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+                                    BigA.getOperand(0), MinAssertVTVal);
+    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
   }
-  case ISD::OR:
-  case ISD::XOR:
-    // If the LHS or RHS don't contribute bits to the or, drop them.
-    if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
-      return V.getOperand(1);
-    if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
-      return V.getOperand(0);
-    break;
-  case ISD::SRL:
-    // Only look at single-use SRLs.
-    if (!V.getNode()->hasOneUse())
-      break;
-    if (ConstantSDNode *RHSC = getAsNonOpaqueConstant(V.getOperand(1))) {
-      // See if we can recursively simplify the LHS.
-      unsigned Amt = RHSC->getZExtValue();
-      // Watch out for shift count overflow though.
-      if (Amt >= Mask.getBitWidth()) break;
-      APInt NewMask = Mask << Amt;
-      if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
-        return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
-                           SimplifyLHS, V.getOperand(1));
-    }
-    break;
-  case ISD::AND: {
-    // X & -1 -> X (ignoring bits which aren't demanded).
-    ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
-    if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
-      return V.getOperand(0);
-    break;
-  }
-  }
   return SDValue();
 }

 /// If the result of a wider load is shifted right by N bits and then
 /// truncated to a narrower type and where N is a multiple of the number of
 /// bits of the narrower type, transform it to a narrower load from address +
 /// N / num of
-/// bits of new type. If the result is to be extended, also fold the extension
-/// to form a extending load.
+/// bits of new type. Also narrow the load if the result is masked with an AND
+/// to effectively produce a smaller type. If the result is to be extended,
+/// also fold the extension to form an extending load.
 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
   unsigned Opc = N->getOpcode();
@@ -7893,28 +8218,40 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     ExtType = ISD::SEXTLOAD;
     ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
   } else if (Opc == ISD::SRL) {
-    // Another special-case: SRL is basically zero-extending a narrower value.
+    // Another special-case: SRL is basically zero-extending a narrower value,
+    // or it may be shifting a higher subword, half, or byte into the lowest
+    // bits.
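The SRL case above relies on shifting a wide load right by a byte multiple being the same as loading fewer bytes at an adjusted address (offset ShAmt/8; the big-endian adjustment happens separately below). A rough model of the equivalence in plain C++, assuming a little-endian host:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Memory backing a 32-bit load.
  unsigned char Mem[4] = {0x11, 0x22, 0x33, 0x44};
  uint32_t Wide;
  std::memcpy(&Wide, Mem, 4);                  // (load i32 x)
  uint16_t Shifted = (uint16_t)(Wide >> 16);   // trunc (srl (load x), 16)
  uint16_t Narrow;
  std::memcpy(&Narrow, Mem + 16 / 8, 2);       // narrow i16 load at x + 2
  assert(Shifted == Narrow);                   // same value, smaller load
}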
ExtType = ISD::ZEXTLOAD;      N0 = SDValue(N, 0); -    ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1)); -    if (!N01) return SDValue(); -    ExtVT = EVT::getIntegerVT(*DAG.getContext(), -                              VT.getSizeInBits() - N01->getZExtValue()); -  } -  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT)) -    return SDValue(); -  unsigned EVTBits = ExtVT.getSizeInBits(); +    auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0)); +    auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1)); +    if (!N01 || !LN0) +      return SDValue(); -  // Do not generate loads of non-round integer types since these can -  // be expensive (and would be wrong if the type is not byte sized). -  if (!ExtVT.isRound()) -    return SDValue(); +    uint64_t ShiftAmt = N01->getZExtValue(); +    uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits(); +    if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) +      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); +    else +      ExtVT = EVT::getIntegerVT(*DAG.getContext(), +                                VT.getSizeInBits() - ShiftAmt); +  } else if (Opc == ISD::AND) { +    // An AND with a constant mask is the same as a truncate + zero-extend. +    auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1)); +    if (!AndC || !AndC->getAPIntValue().isMask()) +      return SDValue(); + +    unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); +    ExtType = ISD::ZEXTLOAD; +    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); +  }    unsigned ShAmt = 0;    if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {      if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {        ShAmt = N01->getZExtValue(); +      unsigned EVTBits = ExtVT.getSizeInBits();        // Is the shift amount a multiple of size of VT?        if ((ShAmt & (EVTBits-1)) == 0) {          N0 = N0.getOperand(0); @@ -7951,42 +8288,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {      }    } -  // If we haven't found a load, we can't narrow it.  Don't transform one with -  // multiple uses, this would require adding a new load. -  if (!isa<LoadSDNode>(N0) || !N0.hasOneUse()) +  // If we haven't found a load, we can't narrow it. +  if (!isa<LoadSDNode>(N0))      return SDValue(); -  // Don't change the width of a volatile load.    LoadSDNode *LN0 = cast<LoadSDNode>(N0); -  if (LN0->isVolatile()) -    return SDValue(); - -  // Verify that we are actually reducing a load width here. -  if (LN0->getMemoryVT().getSizeInBits() < EVTBits) -    return SDValue(); - -  // For the transform to be legal, the load must produce only two values -  // (the value loaded and the chain).  Don't transform a pre-increment -  // load, for example, which produces an extra value.  Otherwise the -  // transformation is not equivalent, and the downstream logic to replace -  // uses gets things wrong. -  if (LN0->getNumValues() > 2) -    return SDValue(); - -  // If the load that we're shrinking is an extload and we're not just -  // discarding the extension we can't simply shrink the load. Bail. -  // TODO: It would be possible to merge the extensions in some cases. 
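The ISD::AND case introduced above treats a low-bit mask as an implicit truncate plus zero-extend, so (and (load i32 x), 0xffff) can become a zero-extending i16 load. The mask test and width computation mirror the APInt operations; a plain-integer sketch (illustrative, not the APInt API):

#include <cassert>
#include <cstdint>

// A value is a low-bit mask iff it is a contiguous run of ones from bit 0.
bool isLowMask(uint32_t V) { return V != 0 && ((V + 1) & V) == 0; }

int main() {
  uint32_t Mask = 0xFFFF;
  assert(isLowMask(Mask) && !isLowMask(0xFF00));
  unsigned ActiveBits = 0;                     // countTrailingOnes
  for (uint32_t V = Mask; V & 1; V >>= 1)
    ++ActiveBits;
  assert(ActiveBits == 16);                    // ExtVT = i16, ExtType = ZEXTLOAD
  uint32_t Loaded = 0xDEADBEEF;                // (load i32 x)
  assert((Loaded & Mask) == (uint32_t)(uint16_t)Loaded); // and == zext(trunc)
}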
-  if (LN0->getExtensionType() != ISD::NON_EXTLOAD &&
-      LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
-    return SDValue();
-
-  if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
-    return SDValue();
-
-  EVT PtrType = N0.getOperand(1).getValueType();
-
-  if (PtrType == MVT::Untyped || PtrType.isExtended())
-    // It's not possible to generate a constant of extended or untyped type.
+  if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
     return SDValue();

   // For big endian targets, we need to adjust the offset to the pointer to
@@ -7997,6 +8304,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
   }

+  EVT PtrType = N0.getOperand(1).getValueType();
   uint64_t PtrOff = ShAmt / 8;
   unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
   SDLoc DL(LN0);
@@ -8130,10 +8438,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   }

   // fold (sext_inreg (extload x)) -> (sextload x)
+  // If sextload is not supported by the target, we can only do the combine
+  // when the load has one use. Doing otherwise can block folding the extload
+  // with other extends that the target does support.
   if (ISD::isEXTLoad(N0.getNode()) &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
+        N0.hasOneUse()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -8208,12 +8520,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // noop truncate
   if (N0.getValueType() == N->getValueType(0))
     return N0;
-  // fold (truncate c1) -> c1
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
-    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
+
   // fold (truncate (truncate x)) -> (truncate x)
   if (N0.getOpcode() == ISD::TRUNCATE)
     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+
+  // fold (truncate c1) -> c1
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
+    SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
+    if (C.getNode() != N)
+      return C;
+  }
+
   // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
   if (N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND ||
@@ -8245,7 +8563,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // we need to be more careful about the vector instructions that we generate.
   if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
-
     EVT VecTy = N0.getOperand(0).getValueType();
     EVT ExTy = N0.getValueType();
     EVT TrTy = N->getValueType(0);
@@ -8311,7 +8628,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
       N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
       N0.getOperand(0).hasOneUse()) {
-
     SDValue BuildVect = N0.getOperand(0);
     EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
     EVT TruncVecEltTy = VT.getVectorElementType();
@@ -8340,9 +8656,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // Currently we only perform this optimization on scalars because vectors
   // may have different active low bits.
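As background for the sext_inreg fold above: on an iN register, sign_extend_inreg is the shift-pair idiom the target would otherwise have to execute, which is why absorbing it into a sign-extending load is profitable. A sketch of the semantics in plain C++, assuming 32-bit registers and the usual two's-complement arithmetic-shift behavior:

#include <cassert>
#include <cstdint>

// sext_inreg x, i8 on an i32 register: move the byte to the top, then
// arithmetic-shift it back down.
int32_t signExtendInReg8(uint32_t X) {
  return (int32_t)(X << 24) >> 24;
}

int main() {
  assert(signExtendInReg8(0x000000FFu) == -1);  // sign bit set: extends to -1
  assert(signExtendInReg8(0x0000007Fu) == 127); // sign bit clear: unchanged
}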
 if (!VT.isVector()) {
-    if (SDValue Shorter =
-            GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
-                                                     VT.getSizeInBits())))
+    APInt Mask =
+        APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
+    if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
   }
@@ -8413,7 +8729,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // Fold truncate of a bitcast of a vector to an extract of the low vector
   // element.
   //
-  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
   if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
     SDValue VecSrc = N0.getOperand(0);
     EVT SrcVT = VecSrc.getValueType();
@@ -8423,8 +8739,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SDLoc SL(N);

       EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1;
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
-                         VecSrc, DAG.getConstant(0, SL, IdxVT));
+                         VecSrc, DAG.getConstant(Idx, SL, IdxVT));
     }
   }
@@ -8466,11 +8783,18 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
   LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
   LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+
+  // A BUILD_PAIR always has the least significant part in elt 0 and the
+  // most significant part in elt 1. So when combining into one large load, we
+  // need to consider the endianness.
+  if (DAG.getDataLayout().isBigEndian())
+    std::swap(LD1, LD2);
+
   if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
       LD1->getAddressSpace() != LD2->getAddressSpace())
     return SDValue();
   EVT LD1VT = LD1->getValueType(0);
-  unsigned LD1Bytes = LD1VT.getSizeInBits() / 8;
+  unsigned LD1Bytes = LD1VT.getStoreSize();
   if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
       DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
     unsigned Align = LD1->getAlignment();
@@ -8751,12 +9075,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       if (Op.getOpcode() == ISD::BITCAST &&
           Op.getOperand(0).getValueType() == VT)
         return SDValue(Op.getOperand(0));
-      if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
+      if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
           ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
         return DAG.getBitcast(VT, Op);
       return SDValue();
     };

+    // FIXME: If either input vector is bitcast, try to convert the shuffle to
+    // the result type of this bitcast. This would eliminate at least one
+    // bitcast. See the transform in InstCombine.
     SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
     SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
     if (!(SV0 && SV1))
       return SDValue();
@@ -8949,7 +9276,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
-  bool LookThroughFPExt = TLI.isFPExtFree(VT);

   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
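The BUILD_PAIR endianness fix above matters because element 0 of a BUILD_PAIR is the low half of the value regardless of memory order, while on a big-endian target the low half comes from the higher address. A plain C++ model of why LD1 and LD2 must be swapped before the consecutive-loads check (host-endianness probed at run time):

#include <cassert>
#include <cstdint>
#include <cstring>

// build_pair(lo, hi) always places `lo` in the least significant half.
uint64_t buildPair(uint32_t Lo, uint32_t Hi) {
  return ((uint64_t)Hi << 32) | Lo;
}

int main() {
  unsigned char Mem[8];
  uint64_t Wide = 0x1122334455667788ULL;
  std::memcpy(Mem, &Wide, 8);       // host-endian i64 in memory

  uint32_t First, Second;           // two consecutive i32 loads
  std::memcpy(&First, Mem, 4);
  std::memcpy(&Second, Mem + 4, 4);

  uint32_t Probe = 1;
  bool IsLE = *(unsigned char *)&Probe == 1;
  // Little-endian: the load at the lower address is the low half.
  // Big-endian: the two loads must be swapped first.
  uint64_t Merged = IsLE ? buildPair(First, Second) : buildPair(Second, First);
  assert(Merged == Wide);
}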
@@ -8979,28 +9305,31 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    }    // Look through FP_EXTEND nodes to do more combining. -  if (LookThroughFPExt) { -    // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) -    if (N0.getOpcode() == ISD::FP_EXTEND) { -      SDValue N00 = N0.getOperand(0); -      if (isContractableFMUL(N00)) -        return DAG.getNode(PreferredFusedOpcode, SL, VT, -                           DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                       N00.getOperand(0)), -                           DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                       N00.getOperand(1)), N1); + +  // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) +  if (N0.getOpcode() == ISD::FP_EXTEND) { +    SDValue N00 = N0.getOperand(0); +    if (isContractableFMUL(N00) && +        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { +      return DAG.getNode(PreferredFusedOpcode, SL, VT, +                         DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                     N00.getOperand(0)), +                         DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                     N00.getOperand(1)), N1);      } +  } -    // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) -    // Note: Commutes FADD operands. -    if (N1.getOpcode() == ISD::FP_EXTEND) { -      SDValue N10 = N1.getOperand(0); -      if (isContractableFMUL(N10)) -        return DAG.getNode(PreferredFusedOpcode, SL, VT, -                           DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                       N10.getOperand(0)), -                           DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                       N10.getOperand(1)), N0); +  // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) +  // Note: Commutes FADD operands. 
+  if (N1.getOpcode() == ISD::FP_EXTEND) { +    SDValue N10 = N1.getOperand(0); +    if (isContractableFMUL(N10) && +        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { +      return DAG.getNode(PreferredFusedOpcode, SL, VT, +                         DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                     N10.getOperand(0)), +                         DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                     N10.getOperand(1)), N0);      }    } @@ -9036,80 +9365,87 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {                                       N0));      } -    if (LookThroughFPExt) { -      // fold (fadd (fma x, y, (fpext (fmul u, v))), z) -      //   -> (fma x, y, (fma (fpext u), (fpext v), z)) -      auto FoldFAddFMAFPExtFMul = [&] ( -          SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { -        return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, -                           DAG.getNode(PreferredFusedOpcode, SL, VT, -                                       DAG.getNode(ISD::FP_EXTEND, SL, VT, U), -                                       DAG.getNode(ISD::FP_EXTEND, SL, VT, V), -                                       Z)); -      }; -      if (N0.getOpcode() == PreferredFusedOpcode) { -        SDValue N02 = N0.getOperand(2); -        if (N02.getOpcode() == ISD::FP_EXTEND) { -          SDValue N020 = N02.getOperand(0); -          if (isContractableFMUL(N020)) -            return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), -                                        N020.getOperand(0), N020.getOperand(1), -                                        N1); + +    // fold (fadd (fma x, y, (fpext (fmul u, v))), z) +    //   -> (fma x, y, (fma (fpext u), (fpext v), z)) +    auto FoldFAddFMAFPExtFMul = [&] ( +      SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { +      return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, +                         DAG.getNode(PreferredFusedOpcode, SL, VT, +                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, U), +                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, V), +                                     Z)); +    }; +    if (N0.getOpcode() == PreferredFusedOpcode) { +      SDValue N02 = N0.getOperand(2); +      if (N02.getOpcode() == ISD::FP_EXTEND) { +        SDValue N020 = N02.getOperand(0); +        if (isContractableFMUL(N020) && +            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { +          return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), +                                      N020.getOperand(0), N020.getOperand(1), +                                      N1);          }        } +    } -      // fold (fadd (fpext (fma x, y, (fmul u, v))), z) -      //   -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) -      // FIXME: This turns two single-precision and one double-precision -      // operation into two double-precision operations, which might not be -      // interesting for all targets, especially GPUs. 
-      auto FoldFAddFPExtFMAFMul = [&] (
-          SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
-                           DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
-                           DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                       DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
-                                       DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
-                                       Z));
-      };
-      if (N0.getOpcode() == ISD::FP_EXTEND) {
-        SDValue N00 = N0.getOperand(0);
-        if (N00.getOpcode() == PreferredFusedOpcode) {
-          SDValue N002 = N00.getOperand(2);
-          if (isContractableFMUL(N002))
-            return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
-                                        N002.getOperand(0), N002.getOperand(1),
-                                        N1);
+    // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
+    //   -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+    // FIXME: This turns two single-precision and one double-precision
+    // operation into two double-precision operations, which might not be
+    // interesting for all targets, especially GPUs.
+    auto FoldFAddFPExtFMAFMul = [&] (
+      SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+      return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                         DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
+                         DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
+                         DAG.getNode(PreferredFusedOpcode, SL, VT,
+                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
+                                     DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
+                                     Z));
+    };
+    if (N0.getOpcode() == ISD::FP_EXTEND) {
+      SDValue N00 = N0.getOperand(0);
+      if (N00.getOpcode() == PreferredFusedOpcode) {
+        SDValue N002 = N00.getOperand(2);
+        if (isContractableFMUL(N002) &&
+            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+          return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
+                                      N002.getOperand(0), N002.getOperand(1),
+                                      N1);
         }
       }
+    }

-      // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
-      //   -> (fma y, z, (fma (fpext u), (fpext v), x))
-      if (N1.getOpcode() == PreferredFusedOpcode) {
-        SDValue N12 = N1.getOperand(2);
-        if (N12.getOpcode() == ISD::FP_EXTEND) {
-          SDValue N120 = N12.getOperand(0);
-          if (isContractableFMUL(N120))
-            return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
-                                        N120.getOperand(0), N120.getOperand(1),
-                                        N0);
+    // fold (fadd x, (fma y, z, (fpext (fmul u, v))))
+    //   -> (fma y, z, (fma (fpext u), (fpext v), x))
+    if (N1.getOpcode() == PreferredFusedOpcode) {
+      SDValue N12 = N1.getOperand(2);
+      if (N12.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N120 = N12.getOperand(0);
+        if (isContractableFMUL(N120) &&
+            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
+          return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
+                                      N120.getOperand(0), N120.getOperand(1),
+                                      N0);
         }
       }
+    }

-      // fold (fadd x, (fpext (fma y, z, (fmul u, v))))
-      //   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
-      // FIXME: This turns two single-precision and one double-precision
-      // operation into two double-precision operations, which might not be
-      // interesting for all targets, especially GPUs.
-      if (N1.getOpcode() == ISD::FP_EXTEND) {
-        SDValue N10 = N1.getOperand(0);
-        if (N10.getOpcode() == PreferredFusedOpcode) {
-          SDValue N102 = N10.getOperand(2);
-          if (isContractableFMUL(N102))
-            return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
-                                        N102.getOperand(0), N102.getOperand(1),
-                                        N0);
+    // fold (fadd x, (fpext (fma y, z, (fmul u, v))))
+    //   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
+    // FIXME: This turns two single-precision and one double-precision
+    // operation into two double-precision operations, which might not be
+    // interesting for all targets, especially GPUs.
+    if (N1.getOpcode() == ISD::FP_EXTEND) {
+      SDValue N10 = N1.getOperand(0);
+      if (N10.getOpcode() == PreferredFusedOpcode) {
+        SDValue N102 = N10.getOperand(2);
+        if (isContractableFMUL(N102) &&
+            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
+          return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
+                                      N102.getOperand(0), N102.getOperand(1),
+                                      N0);
         }
       }
     }
@@ -9151,7 +9487,6 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
-  bool LookThroughFPExt = TLI.isFPExtFree(VT);

   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
@@ -9187,79 +9522,83 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   }

   // Look through FP_EXTEND nodes to do more combining.
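One way to see why these fpext folds are gated by a target hook rather than being unconditionally free: extending float operands to double makes the inner multiply exact, so the fused double-precision result can differ from the original float-multiply-then-double-add. A small demonstration on a typical IEEE-754 host (plain C++; whether the difference is acceptable is exactly what isFPExtFoldable and the contraction flags decide):

#include <cmath>
#include <cstdio>

int main() {
  float x = 1.0f + 0x1.0p-13f, y = 1.0f - 0x1.0p-13f;
  double z = -1.0;
  // fadd (fpext (fmul x, y)), z: the float multiply rounds x*y to 1.0f.
  double Unfused = (double)(x * y) + z;
  // fma (fpext x), (fpext y), z: x*y is exact in double (1 - 2^-26).
  double Fused = std::fma((double)x, (double)y, z);
  std::printf("unfused = %g, fused = %g\n", Unfused, Fused); // 0 vs ~-1.49e-8
}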
-  if (LookThroughFPExt) {
-    // fold (fsub (fpext (fmul x, y)), z)
-    //   -> (fma (fpext x), (fpext y), (fneg z))
-    if (N0.getOpcode() == ISD::FP_EXTEND) {
-      SDValue N00 = N0.getOperand(0);
-      if (isContractableFMUL(N00))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                       N00.getOperand(0)),
-                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                       N00.getOperand(1)),
-                           DAG.getNode(ISD::FNEG, SL, VT, N1));
+
+  // fold (fsub (fpext (fmul x, y)), z)
+  //   -> (fma (fpext x), (fpext y), (fneg z))
+  if (N0.getOpcode() == ISD::FP_EXTEND) {
+    SDValue N00 = N0.getOperand(0);
+    if (isContractableFMUL(N00) &&
+        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+      return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                     N00.getOperand(0)),
+                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                     N00.getOperand(1)),
+                         DAG.getNode(ISD::FNEG, SL, VT, N1));
     }
   }

-    // fold (fsub x, (fpext (fmul y, z)))
-    //   -> (fma (fneg (fpext y)), (fpext z), x)
-    // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FP_EXTEND) {
-      SDValue N10 = N1.getOperand(0);
-      if (isContractableFMUL(N10))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FNEG, SL, VT,
+  // fold (fsub x, (fpext (fmul y, z)))
+  //   -> (fma (fneg (fpext y)), (fpext z), x)
+  // Note: Commutes FSUB operands.
+  if (N1.getOpcode() == ISD::FP_EXTEND) {
+    SDValue N10 = N1.getOperand(0);
+    if (isContractableFMUL(N10) &&
+        TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
+      return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                         DAG.getNode(ISD::FNEG, SL, VT,
+                                     DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                 N10.getOperand(0))),
+                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                     N10.getOperand(1)),
+                         N0);
+    }
+  }
+
+  // fold (fsub (fpext (fneg (fmul x, y))), z)
+  //   -> (fneg (fma (fpext x), (fpext y), z))
+  // Note: This could be removed with appropriate canonicalization of the
+  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+  // us from implementing the canonicalization in visitFSUB.
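The fsub variants follow the same pattern with a negation folded in; algebraically, fsub (fpext (fmul x, y)), z equals fma (fpext x), (fpext y), (fneg z). A quick check of the identity in plain C++, with operands chosen so that no rounding occurs and the equality is exact:

#include <cassert>
#include <cmath>

int main() {
  float x = 3.0f, y = 5.0f;
  double z = 7.0;
  // fsub (fpext (fmul x, y)), z  -->  fma (fpext x), (fpext y), (fneg z)
  double Lhs = (double)(x * y) - z;
  double Rhs = std::fma((double)x, (double)y, -z);
  assert(Lhs == Rhs); // 3*5-7: every value is exact, so this holds bit-for-bit
}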
+  if (N0.getOpcode() == ISD::FP_EXTEND) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == ISD::FNEG) {
+      SDValue N000 = N00.getOperand(0);
+      if (isContractableFMUL(N000) &&
+          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+        return DAG.getNode(ISD::FNEG, SL, VT,
+                           DAG.getNode(PreferredFusedOpcode, SL, VT,
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                   N10.getOperand(0))),
-                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                       N10.getOperand(1)),
-                           N0);
-    }
-
-    // fold (fsub (fpext (fneg (fmul, x, y))), z)
-    //   -> (fneg (fma (fpext x), (fpext y), z))
-    // Note: This could be removed with appropriate canonicalization of the
-    // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
-    // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
-    // from implementing the canonicalization in visitFSUB.
-    if (N0.getOpcode() == ISD::FP_EXTEND) {
-      SDValue N00 = N0.getOperand(0);
-      if (N00.getOpcode() == ISD::FNEG) {
-        SDValue N000 = N00.getOperand(0);
-        if (isContractableFMUL(N000)) {
-          return DAG.getNode(ISD::FNEG, SL, VT,
-                             DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                     N000.getOperand(0)),
-                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                     N000.getOperand(1)),
-                                         N1));
-        }
+                                                   N000.getOperand(0)),
+                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N000.getOperand(1)),
+                                       N1));
       }
     }
+  }

-    // fold (fsub (fneg (fpext (fmul, x, y))), z)
-    //   -> (fneg (fma (fpext x)), (fpext y), z)
-    // Note: This could be removed with appropriate canonicalization of the
-    // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
-    // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
-    // from implementing the canonicalization in visitFSUB.
-    if (N0.getOpcode() == ISD::FNEG) {
-      SDValue N00 = N0.getOperand(0);
-      if (N00.getOpcode() == ISD::FP_EXTEND) {
-        SDValue N000 = N00.getOperand(0);
-        if (isContractableFMUL(N000)) {
-          return DAG.getNode(ISD::FNEG, SL, VT,
-                             DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                     N000.getOperand(0)),
-                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                     N000.getOperand(1)),
-                                         N1));
-        }
+  // fold (fsub (fneg (fpext (fmul x, y))), z)
+  //   -> (fneg (fma (fpext x), (fpext y), z))
+  // Note: This could be removed with appropriate canonicalization of the
+  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+  // us from implementing the canonicalization in visitFSUB.
+  if (N0.getOpcode() == ISD::FNEG) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == ISD::FP_EXTEND) {
+      SDValue N000 = N00.getOperand(0);
+      if (isContractableFMUL(N000) &&
+          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) {
+        return DAG.getNode(ISD::FNEG, SL, VT,
+                           DAG.getNode(PreferredFusedOpcode, SL, VT,
+                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N000.getOperand(0)),
+                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N000.getOperand(1)),
+                                       N1));
       }
     }
-    }

   // More folding opportunities when the target permits.
@@ -9298,102 +9637,108 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
                                      N21, N0));
     }

-    if (LookThroughFPExt) {
-      // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
-      //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
-      if (N0.getOpcode() == PreferredFusedOpcode) {
-        SDValue N02 = N0.getOperand(2);
-        if (N02.getOpcode() == ISD::FP_EXTEND) {
-          SDValue N020 = N02.getOperand(0);
-          if (isContractableFMUL(N020))
-            return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                               N0.getOperand(0), N0.getOperand(1),
-                               DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                       N020.getOperand(0)),
-                                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                       N020.getOperand(1)),
-                                           DAG.getNode(ISD::FNEG, SL, VT,
-                                                       N1)));
-        }
-      }
-      // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
-      //   -> (fma (fpext x), (fpext y),
-      //           (fma (fpext u), (fpext v), (fneg z)))
-      // FIXME: This turns two single-precision and one double-precision
-      // operation into two double-precision operations, which might not be
-      // interesting for all targets, especially GPUs.
-      if (N0.getOpcode() == ISD::FP_EXTEND) {
-        SDValue N00 = N0.getOperand(0);
-        if (N00.getOpcode() == PreferredFusedOpcode) {
-          SDValue N002 = N00.getOperand(2);
-          if (isContractableFMUL(N002))
-            return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                               DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                           N00.getOperand(0)),
-                               DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                           N00.getOperand(1)),
-                               DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                       N002.getOperand(0)),
-                                           DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                       N002.getOperand(1)),
-                                           DAG.getNode(ISD::FNEG, SL, VT,
-                                                       N1)));
-        }
-      }
-
-      // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
-      //   -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
-      if (N1.getOpcode() == PreferredFusedOpcode &&
-        N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
-        SDValue N120 = N1.getOperand(2).getOperand(0);
-        if (isContractableFMUL(N120)) {
-          SDValue N1200 = N120.getOperand(0);
-          SDValue N1201 = N120.getOperand(1);
+    // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
+    //   -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
+    if (N0.getOpcode() == PreferredFusedOpcode) {
+      SDValue N02 = N0.getOperand(2);
+      if (N02.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N020 = N02.getOperand(0);
+        if (isContractableFMUL(N020) &&
+            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                             DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
-                             N1.getOperand(1),
+                             N0.getOperand(0), N0.getOperand(1),
                              DAG.getNode(PreferredFusedOpcode, SL, VT,
-                                         DAG.getNode(ISD::FNEG, SL, VT,
-                                             DAG.getNode(ISD::FP_EXTEND, SL,
-                                                         VT, N1200)),
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
-                                                     N1201),
-                                         N0));
+                                                     N020.getOperand(0)),
+                                         DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                     N020.getOperand(1)),
+                                         DAG.getNode(ISD::FNEG, SL, VT,
+                                                     N1)));
         }
       }
+    }

-      // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
-      //   -> (fma (fneg (fpext y)), (fpext z),
-      //           (fma (fneg (fpext u)), (fpext v), x))
-      // FIXME: This turns two single-precision and one double-precision
-      // operation into two double-precision operations, which might not be
-      // interesting for all targets, especially GPUs.
-      if (N1.getOpcode() == ISD::FP_EXTEND && -        N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { -        SDValue N100 = N1.getOperand(0).getOperand(0); -        SDValue N101 = N1.getOperand(0).getOperand(1); -        SDValue N102 = N1.getOperand(0).getOperand(2); -        if (isContractableFMUL(N102)) { -          SDValue N1020 = N102.getOperand(0); -          SDValue N1021 = N102.getOperand(1); +    // fold (fsub (fpext (fma x, y, (fmul u, v))), z) +    //   -> (fma (fpext x), (fpext y), +    //           (fma (fpext u), (fpext v), (fneg z))) +    // FIXME: This turns two single-precision and one double-precision +    // operation into two double-precision operations, which might not be +    // interesting for all targets, especially GPUs. +    if (N0.getOpcode() == ISD::FP_EXTEND) { +      SDValue N00 = N0.getOperand(0); +      if (N00.getOpcode() == PreferredFusedOpcode) { +        SDValue N002 = N00.getOperand(2); +        if (isContractableFMUL(N002) && +            TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {            return DAG.getNode(PreferredFusedOpcode, SL, VT, -                             DAG.getNode(ISD::FNEG, SL, VT, -                                         DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                                     N100)), -                             DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), +                             DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                         N00.getOperand(0)), +                             DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                         N00.getOperand(1)),                               DAG.getNode(PreferredFusedOpcode, SL, VT, -                                         DAG.getNode(ISD::FNEG, SL, VT, -                                             DAG.getNode(ISD::FP_EXTEND, SL, -                                                         VT, N1020)),                                           DAG.getNode(ISD::FP_EXTEND, SL, VT, -                                                     N1021), -                                         N0)); +                                                     N002.getOperand(0)), +                                         DAG.getNode(ISD::FP_EXTEND, SL, VT, +                                                     N002.getOperand(1)), +                                         DAG.getNode(ISD::FNEG, SL, VT, +                                                     N1)));          }        }      } + +    // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) +    //   -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) +    if (N1.getOpcode() == PreferredFusedOpcode && +        N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { +      SDValue N120 = N1.getOperand(2).getOperand(0); +      if (isContractableFMUL(N120) && +          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { +        SDValue N1200 = N120.getOperand(0); +        SDValue N1201 = N120.getOperand(1); +        return DAG.getNode(PreferredFusedOpcode, SL, VT, +                           DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), +                           N1.getOperand(1), +                           DAG.getNode(PreferredFusedOpcode, SL, VT, +                                       DAG.getNode(ISD::FNEG, SL, VT, +                                                   DAG.getNode(ISD::FP_EXTEND, SL, +                                                               VT, N1200)), +                                      
 DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N1201),
+                                       N0));
+      }
+    }
+
+    // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
+    //   -> (fma (fneg (fpext y)), (fpext z),
+    //           (fma (fneg (fpext u)), (fpext v), x))
+    // FIXME: This turns two single-precision and one double-precision
+    // operation into two double-precision operations, which might not be
+    // interesting for all targets, especially GPUs.
+    if (N1.getOpcode() == ISD::FP_EXTEND &&
+        N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
+      SDValue CvtSrc = N1.getOperand(0);
+      SDValue N100 = CvtSrc.getOperand(0);
+      SDValue N101 = CvtSrc.getOperand(1);
+      SDValue N102 = CvtSrc.getOperand(2);
+      if (isContractableFMUL(N102) &&
+          TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) {
+        SDValue N1020 = N102.getOperand(0);
+        SDValue N1021 = N102.getOperand(1);
+        return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                           DAG.getNode(ISD::FNEG, SL, VT,
+                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N100)),
+                           DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
+                           DAG.getNode(PreferredFusedOpcode, SL, VT,
+                                       DAG.getNode(ISD::FNEG, SL, VT,
+                                                   DAG.getNode(ISD::FP_EXTEND, SL,
+                                                               VT, N1020)),
+                                       DAG.getNode(ISD::FP_EXTEND, SL, VT,
+                                                   N1021),
+                                       N0));
+      }
+    }
   }

   return SDValue();
@@ -9959,6 +10304,14 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
       // TODO: The FMA node should have flags that propagate to this node.
       return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
     }
+
+    // fma (fneg x), K, y -> fma x, -K, y
+    if (N0.getOpcode() == ISD::FNEG &&
+        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
+      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
+                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
+    }
   }

   if (Options.UnsafeFPMath) {
@@ -10081,8 +10434,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
           (!LegalOperations ||
            // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
            // backend)... we should handle this gracefully after Legalize.
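The new visitFMA fold above uses the sign symmetry of multiplication: (-x)*K + y equals x*(-K) + y, with no rounding concern since only the sign bit of the constant changes. It pays off when -K is no more expensive to materialize than K, which is what the isFPImmLegal/one-use checks decide. The identity itself, in plain C++:

#include <cassert>
#include <cmath>

int main() {
  double x = 1.5, K = -4.25, y = 0.75;
  // fma (fneg x), K, y --> fma x, (fneg K), y
  // The exact products (-x)*K and x*(-K) are equal, so the rounded
  // fused results are identical.
  assert(std::fma(-x, K, y) == std::fma(x, -K, y));
}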
-           // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
-           TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
+           // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
+           TLI.isOperationLegal(ISD::ConstantFP, VT) ||
            TLI.isFPImmLegal(Recip, VT)))
         return DAG.getNode(ISD::FMUL, DL, VT, N0,
                            DAG.getConstantFP(Recip, DL, VT), Flags);
@@ -10264,7 +10617,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
-       TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);

   // If the input is a legal type, and SINT_TO_FP is not legal on this target,
@@ -10282,7 +10635,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
     if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
         !VT.isVector() &&
         (!LegalOperations ||
-         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
       SDLoc DL(N);
       SDValue Ops[] =
         { N0.getOperand(0), N0.getOperand(1),
@@ -10296,7 +10649,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
     if (N0.getOpcode() == ISD::ZERO_EXTEND &&
         N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
         (!LegalOperations ||
-         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
       SDLoc DL(N);
       SDValue Ops[] =
         { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
@@ -10318,7 +10671,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
-       TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
     return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);

   // If the input is a legal type, and UINT_TO_FP is not legal on this target,
@@ -10333,10 +10686,9 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   // The next optimizations are desirable only if SELECT_CC can be lowered.
   if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
     // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
-
     if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
         (!LegalOperations ||
-         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
       SDLoc DL(N);
       SDValue Ops[] =
         { N0.getOperand(0), N0.getOperand(1),
@@ -10557,6 +10909,19 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
   if (isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);

+  // fold ftrunc (known rounded int x) -> x
+  // ftrunc is part of the fptosi/fptoui expansion on some targets, so it is
+  // likely to be generated to extract an integer from a rounded floating-point
+  // value.
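The ftrunc fold (implemented by the switch that follows) is safe because rint, trunc, nearbyint, floor, and ceil all produce values that are already integral, so a subsequent trunc is an identity, including for negative inputs. A plain C++ check of that claim:

#include <cassert>
#include <cmath>

int main() {
  for (double V : {2.7, -2.7, 0.5, -0.5}) {
    assert(std::trunc(std::floor(V)) == std::floor(V));
    assert(std::trunc(std::ceil(V)) == std::ceil(V));
    assert(std::trunc(std::nearbyint(V)) == std::nearbyint(V));
    assert(std::trunc(std::trunc(V)) == std::trunc(V));
  }
}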
+  switch (N0.getOpcode()) {
+  default: break;
+  case ISD::FRINT:
+  case ISD::FTRUNC:
+  case ISD::FNEARBYINT:
+  case ISD::FFLOOR:
+  case ISD::FCEIL:
+    return N0;
+  }
+
   return SDValue();
 }
@@ -11160,6 +11525,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
   // Replace the uses of Ptr with uses of the updated base value.
   DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
   deleteAndRecombine(Ptr.getNode());
+  AddToWorklist(Result.getNode());

   return true;
 }
@@ -11445,6 +11811,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 }

 namespace {
+
 /// \brief Helper structure used to slice a load into smaller loads.
 /// Basically a slice is obtained from the following sequence:
 /// Origin = load Ty1, Base
@@ -11462,21 +11829,19 @@ struct LoadedSlice {
   struct Cost {
     /// Are we optimizing for code size.
     bool ForCodeSize;
+
     /// Various costs.
-    unsigned Loads;
-    unsigned Truncates;
-    unsigned CrossRegisterBanksCopies;
-    unsigned ZExts;
-    unsigned Shift;
+    unsigned Loads = 0;
+    unsigned Truncates = 0;
+    unsigned CrossRegisterBanksCopies = 0;
+    unsigned ZExts = 0;
+    unsigned Shift = 0;

-    Cost(bool ForCodeSize = false)
-        : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
-          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+    Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}

     /// \brief Get the cost of one isolated slice.
     Cost(const LoadedSlice &LS, bool ForCodeSize = false)
-        : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
-          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+        : ForCodeSize(ForCodeSize), Loads(1) {
       EVT TruncType = LS.Inst->getValueType(0);
       EVT LoadedType = LS.getLoadedType();
       if (TruncType != LoadedType &&
@@ -11538,13 +11903,17 @@ struct LoadedSlice {
     bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
   };
+
   // The last instruction that represents the slice. This should be a
   // truncate instruction.
   SDNode *Inst;
+
   // The original load instruction.
   LoadSDNode *Origin;
+
   // The right shift amount in bits from the original load.
   unsigned Shift;
+
   // The DAG from which Origin comes.
   // This is used to get some contextual information about legal types, etc.
   SelectionDAG *DAG;
@@ -11746,7 +12115,8 @@ struct LoadedSlice {
     return true;
   }
 };
-}
+
+} // end anonymous namespace

 /// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
 /// \p UsedBits looks like 0..0 1..1 0..0.
@@ -11804,7 +12174,6 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
   for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
                 // Set the beginning of the pair.
                                                            First = Second) {
-
     Second = &LoadedSlices[CurrSlice];

     // If First is NULL, it means we start a new pair.
@@ -11935,7 +12304,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
     // will be across several bytes. We do not support that.
     unsigned Width = User->getValueSizeInBits(0);
     if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
-      return 0;
+      return false;

     // Build the slice for this chain of computations.
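For context on the slicing machinery above and the LoadedSlice construction that follows: when a wide load is only consumed piecewise, each (trunc (srl Origin, Shift)) chain can become an independent narrow load if the cost model approves. The effect, modeled in plain C++ on a little-endian host:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char Mem[8];
  uint64_t Origin = 0x1122334455667788ULL;
  std::memcpy(Mem, &Origin, 8);

  // Two slices of the original i64 load: Shift = 0 and Shift = 32.
  uint32_t Lo = (uint32_t)Origin;          // trunc (srl Origin, 0)
  uint32_t Hi = (uint32_t)(Origin >> 32);  // trunc (srl Origin, 32)

  // Replaced by two independent i32 loads (little-endian offsets 0 and 4).
  uint32_t LoSlice, HiSlice;
  std::memcpy(&LoSlice, Mem, 4);
  std::memcpy(&HiSlice, Mem + 4, 4);
  assert(Lo == LoSlice && Hi == HiSlice);
}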
LoadedSlice LS(User, LD, Shift, &DAG); @@ -12060,7 +12429,6 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {    return Result;  } -  /// Check to see if IVal is something that provides a value as specified by  /// MaskInfo. If so, replace the specified store with a narrower store of  /// truncated IVal. @@ -12121,7 +12489,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,        .getNode();  } -  /// Look for sequence of load / op / store where op is one of 'or', 'xor', and  /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try  /// narrowing the load and store if it would end up being a win for performance @@ -12325,7 +12692,6 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,    // Walk all the users of the constant with which we're multiplying.    for (SDNode *Use : ConstNode->uses()) { -      if (Use == MulNode) // This use is the one we're on right now. Skip it.        continue; @@ -12376,6 +12742,12 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,    return false;  } +static SDValue peekThroughBitcast(SDValue V) { +  while (V.getOpcode() == ISD::BITCAST) +    V = V.getOperand(0); +  return V; +} +  SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,                                           unsigned NumStores) {    SmallVector<SDValue, 8> Chains; @@ -12403,56 +12775,93 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(    if (NumStores < 2)      return false; -  int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; -    // The latest Node in the DAG.    SDLoc DL(StoreNodes[0].MemNode); -  SDValue StoredVal; +  int64_t ElementSizeBits = MemVT.getStoreSizeInBits(); +  unsigned SizeInBits = NumStores * ElementSizeBits; +  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; + +  EVT StoreTy;    if (UseVector) { -    bool IsVec = MemVT.isVector(); -    unsigned Elts = NumStores; -    if (IsVec) { -      // When merging vector stores, get the total number of elements. -      Elts *= MemVT.getVectorNumElements(); -    } +    unsigned Elts = NumStores * NumMemElts;      // Get the type for the merged vector store. -    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); -    assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); +    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); +  } else +    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); +  SDValue StoredVal; +  if (UseVector) {      if (IsConstantSrc) {        SmallVector<SDValue, 8> BuildVector; -      for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { +      for (unsigned I = 0; I != NumStores; ++I) {          StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);          SDValue Val = St->getValue(); -        if (MemVT.getScalarType().isInteger()) -          if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) -            Val = DAG.getConstant( -                (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), -                SDLoc(CFP), MemVT); +        // If constant is of the wrong type, convert it now. +        if (MemVT != Val.getValueType()) { +          Val = peekThroughBitcast(Val); +          // Deal with constants of wrong size. 
+          if (ElementSizeBits != Val.getValueSizeInBits()) { +            EVT IntMemVT = +                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); +            if (isa<ConstantFPSDNode>(Val)) { +              // Not clear how to truncate FP values. +              return false; +            } else if (auto *C = dyn_cast<ConstantSDNode>(Val)) +              Val = DAG.getConstant(C->getAPIntValue() +                                        .zextOrTrunc(Val.getValueSizeInBits()) +                                        .zextOrTrunc(ElementSizeBits), +                                    SDLoc(C), IntMemVT); +          } +          // Make sure the correctly sized value has the correct type. +          Val = DAG.getBitcast(MemVT, Val); +        } +        BuildVector.push_back(Val); +      } -      StoredVal = DAG.getBuildVector(Ty, DL, BuildVector); +      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS +                                               : ISD::BUILD_VECTOR, +                              DL, StoreTy, BuildVector);     } else {       SmallVector<SDValue, 8> Ops;       for (unsigned i = 0; i < NumStores; ++i) {         StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -        SDValue Val = St->getValue(); -        // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. -        if (Val.getValueType() != MemVT) -          return false; +        SDValue Val = peekThroughBitcast(St->getValue()); +        // All operands of BUILD_VECTOR / CONCAT_VECTORS must be of +        // type MemVT. If the underlying value is not the correct +        // type, but it is an extraction of an appropriate vector, we +        // can recast Val to be of the correct type. This may require +        // converting between EXTRACT_VECTOR_ELT and +        // EXTRACT_SUBVECTOR. +        if ((MemVT != Val.getValueType()) && +            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || +             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) { +          SDValue Vec = Val.getOperand(0); +          EVT MemVTScalarTy = MemVT.getScalarType(); +          // We may need to add a bitcast here to get types to line up. +          if (MemVTScalarTy != Vec.getValueType()) { +            unsigned Elts = Vec.getValueType().getSizeInBits() / +                            MemVTScalarTy.getSizeInBits(); +            EVT NewVecTy = +                EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts); +            Vec = DAG.getBitcast(NewVecTy, Vec); +          } +          auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR +                                        : ISD::EXTRACT_VECTOR_ELT; +          Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1)); +        }         Ops.push_back(Val);       }       // Build the extracted vector elements back into a vector. -      StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, -                              DL, Ty, Ops);    } +      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS +                                               : ISD::BUILD_VECTOR, +                              DL, StoreTy, Ops); +    }   } else {     // We should always use a vector store when merging extracted vector     // elements, so this path implies a store of constants.     
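The constant path below packs the per-element constants of the merged stores into a single wide integer by repeated shift-and-or. A sketch of that packing with uint64_t standing in for APInt (endianness handling of the final store is outside this model):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Model of the StoreInt loop below: shift left by the element width,
    // then OR in the next constant, masked to EltBits (zextOrTrunc
    // analogue). Assumes EltBits < 64.
    uint64_t packConstants(const std::vector<uint64_t> &Elts, unsigned EltBits) {
      uint64_t StoreInt = 0;
      for (uint64_t E : Elts) {
        StoreInt <<= EltBits;
        StoreInt |= E & ((1ULL << EltBits) - 1);
      }
      return StoreInt;
    }

    int main() {
      // Four i8 constants; StoreNodes[0] ends up in the high byte.
      assert(packConstants({0x11, 0x22, 0x33, 0x44}, 8) == 0x11223344);
      return 0;
    }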
assert(IsConstantSrc && "Merged vector elements should use vector store"); -    unsigned SizeInBits = NumStores * ElementSizeBytes * 8;      APInt StoreInt(SizeInBits, 0);      // Construct a single integer constant which is made of the smaller @@ -12463,18 +12872,25 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);        SDValue Val = St->getValue(); -      StoreInt <<= ElementSizeBytes * 8; +      StoreInt <<= ElementSizeBits;        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { -        StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); +        StoreInt |= C->getAPIntValue() +                        .zextOrTrunc(ElementSizeBits) +                        .zextOrTrunc(SizeInBits);        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { -        StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); +        StoreInt |= C->getValueAPF() +                        .bitcastToAPInt() +                        .zextOrTrunc(ElementSizeBits) +                        .zextOrTrunc(SizeInBits); +        // If fp truncation is necessary give up for now. +        if (MemVT.getSizeInBits() != ElementSizeBits) +          return false;        } else {          llvm_unreachable("Invalid constant element type");        }      }      // Create the new Load and Store operations. -    EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);      StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);    } @@ -12483,7 +12899,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(    // make sure we use trunc store if it's necessary to be legal.    SDValue NewStore; -  if (UseVector || !UseTrunc) { +  if (!UseTrunc) {      NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),                              FirstInChain->getPointerInfo(),                              FirstInChain->getAlignment()); @@ -12517,6 +12933,7 @@ void DAGCombiner::getStoreMergeCandidates(    BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);    EVT MemVT = St->getMemoryVT(); +  SDValue Val = peekThroughBitcast(St->getValue());    // We must have a base and an offset.    if (!BasePtr.getBase().getNode())      return; @@ -12525,47 +12942,62 @@ void DAGCombiner::getStoreMergeCandidates(    if (BasePtr.getBase().isUndef())      return; -  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) || -                       isa<ConstantFPSDNode>(St->getValue()); -  bool IsExtractVecSrc = -      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || -       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); -  bool IsLoadSrc = isa<LoadSDNode>(St->getValue()); +  bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val); +  bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || +                          Val.getOpcode() == ISD::EXTRACT_SUBVECTOR); +  bool IsLoadSrc = isa<LoadSDNode>(Val);    BaseIndexOffset LBasePtr;    // Match on loadbaseptr if relevant. -  if (IsLoadSrc) -    LBasePtr = BaseIndexOffset::match( -        cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG); - +  EVT LoadVT; +  if (IsLoadSrc) { +    auto *Ld = cast<LoadSDNode>(Val); +    LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); +    LoadVT = Ld->getMemoryVT(); +    // Load and store should be the same type. 
+    if (MemVT != LoadVT) +      return; +  }   auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,                             int64_t &Offset) -> bool {     if (Other->isVolatile() || Other->isIndexed())       return false; -    // We can merge constant floats to equivalent integers -    if (Other->getMemoryVT() != MemVT) -      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) && -            isa<ConstantFPSDNode>(Other->getValue()))) -        return false; +    SDValue Val = peekThroughBitcast(Other->getValue()); +    // Allow merging constants of different types as integers. +    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) +                                           : Other->getMemoryVT() != MemVT;     if (IsLoadSrc) { +      if (NoTypeMatch) +        return false;       // The Load's Base Ptr must also match -      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) { +      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {         auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG); +        if (LoadVT != OtherLd->getMemoryVT()) +          return false;         if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))           return false;       } else         return false;     } -    if (IsConstantSrc) -      if (!(isa<ConstantSDNode>(Other->getValue()) || -            isa<ConstantFPSDNode>(Other->getValue()))) +    if (IsConstantSrc) { +      if (NoTypeMatch)         return false; -    if (IsExtractVecSrc) -      if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || -            Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR)) +      if (!(isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val))) +        return false; +    } +    if (IsExtractVecSrc) { +      // Do not merge truncated stores here. +      if (Other->isTruncatingStore())         return false; +      if (!MemVT.bitsEq(Val.getValueType())) +        return false; +      if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT && +          Val.getOpcode() != ISD::EXTRACT_SUBVECTOR) +        return false; +    }     Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);     return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));   }; +    // We are looking for a root node which is an ancestor to all mergeable   // stores. We search up through a load, to our root and then down   // through all children. For instance we will find Store{1,2,3} if @@ -12612,10 +13044,8 @@ void DAGCombiner::getStoreMergeCandidates(  // indirectly through its operand (we already consider dependencies  // through the chain). Check in parallel by searching up from  // non-chain operands of candidates. -  bool DAGCombiner::checkMergeStoreCandidatesForDependencies(      SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { -    // FIXME: We should be able to truncate a full search of    // predecessors by doing a BFS and keeping tabs on the originating    // stores from which worklist nodes come, in a similar way to @@ -12648,12 +13078,13 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {      return false;    EVT MemVT = St->getMemoryVT(); -  int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; +  int64_t ElementSizeBytes = MemVT.getStoreSize(); +  unsigned NumMemElts = MemVT.isVector() ?
MemVT.getVectorNumElements() : 1;    if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)      return false; -  bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( +  bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(        Attribute::NoImplicitFloat);    // This function cannot currently deal with non-byte-sized memory sizes. @@ -12665,7 +13096,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {    // Perform an early exit check. Do not bother looking at stored values that    // are not constants, loads, or extracted vector elements. -  SDValue StoredVal = St->getValue(); +  SDValue StoredVal = peekThroughBitcast(St->getValue());    bool IsLoadSrc = isa<LoadSDNode>(StoredVal);    bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||                         isa<ConstantFPSDNode>(StoredVal); @@ -12675,12 +13106,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {    if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)      return false; -  // Don't merge vectors into wider vectors if the source data comes from loads. -  // TODO: This restriction can be lifted by using logic similar to the -  // ExtractVecSrc case. -  if (MemVT.isVector() && IsLoadSrc) -    return false; -    SmallVector<MemOpLink, 8> StoreNodes;    // Find potential store merge candidates by searching through chain sub-DAG    getStoreMergeCandidates(St, StoreNodes); @@ -12759,19 +13184,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {        unsigned LastLegalVectorType = 1;        bool LastIntegerTrunc = false;        bool NonZero = false; +      unsigned FirstZeroAfterNonZero = NumConsecutiveStores;        for (unsigned i = 0; i < NumConsecutiveStores; ++i) {          StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);          SDValue StoredVal = ST->getValue(); - -        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { -          NonZero |= !C->isNullValue(); -        } else if (ConstantFPSDNode *C = -                       dyn_cast<ConstantFPSDNode>(StoredVal)) { -          NonZero |= !C->getConstantFPValue()->isNullValue(); -        } else { -          // Non-constant. -          break; +        bool IsElementZero = false; +        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) +          IsElementZero = C->isNullValue(); +        else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) +          IsElementZero = C->getConstantFPValue()->isNullValue(); +        if (IsElementZero) { +          if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) +            FirstZeroAfterNonZero = i;          } +        NonZero |= !IsElementZero;          // Find a legal type for the constant store.          
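The NonZero/FirstZeroAfterNonZero bookkeeping above can be modeled on a flat array of flags; the recorded index later bounds how far the candidate list may be skipped:

    #include <cassert>
    #include <vector>

    // Record the first zero element that appears after a non-zero one,
    // as the merge loop above does while scanning consecutive stores.
    // Returns the element count when no such boundary exists, matching
    // the NumConsecutiveStores default.
    unsigned firstZeroAfterNonZero(const std::vector<bool> &IsElementZero) {
      unsigned N = (unsigned)IsElementZero.size();
      unsigned First = N;
      bool NonZero = false;
      for (unsigned i = 0; i < N; ++i) {
        if (IsElementZero[i] && NonZero && First == N)
          First = i;
        NonZero |= !IsElementZero[i];
      }
      return First;
    }

    int main() {
      assert(firstZeroAfterNonZero({true, false, true, false}) == 2);
      assert(firstZeroAfterNonZero({true, true}) == 2); // all zero: no boundary
      return 0;
    }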
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; @@ -12791,8 +13217,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {                TLI.getTypeToTransformTo(Context, StoredVal.getValueType());            if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&                TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && -              TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, -                                     FirstStoreAS, FirstStoreAlign, &IsFast) && +              TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, +                                     FirstStoreAlign, &IsFast) &&                IsFast) {              LastIntegerTrunc = true;              LastLegalType = i + 1; @@ -12806,13 +13232,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {               TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&              !NoVectors) {            // Find a legal type for the vector store. -          unsigned Elts = i + 1; -          if (MemVT.isVector()) { -            // When merging vector stores, get the total number of elements. -            Elts *= MemVT.getVectorNumElements(); -          } +          unsigned Elts = (i + 1) * NumMemElts;            EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); -          if (TLI.isTypeLegal(Ty) && +          if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&                TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&                TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,                                       FirstStoreAlign, &IsFast) && @@ -12821,23 +13243,34 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {          }        } +      bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; +      unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; +        // Check if we found a legal integer type that creates a meaningful merge. -      if (LastLegalType < 2 && LastLegalVectorType < 2) { -        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); +      if (NumElem < 2) { +        // We know that candidate stores are in order and of correct +        // shape. While there is no mergeable sequence from the +        // beginning one may start later in the sequence. The only +        // reason a merge of size N could have failed where another of +        // the same size would not have, is if the alignment has +        // improved or we've dropped a non-zero value. Drop as many +        // candidates as we can here. +        unsigned NumSkip = 1; +        while ( +            (NumSkip < NumConsecutiveStores) && +            (NumSkip < FirstZeroAfterNonZero) && +            (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) { +          NumSkip++; +        } +        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);          continue;        } -      bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; -      unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; -        bool Merged = MergeStoresOfConstantsOrVecElts(            StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); -      if (!Merged) { -        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -        continue; -      } +      RV |= Merged; +        // Remove merged stores for next iteration. 
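A sketch of the skip-ahead computation above: candidates are dropped while the run continues, no zero-after-non-zero boundary is crossed, and the alignment has not improved over the first store (alignments modeled as plain integers):

    #include <cassert>
    #include <vector>

    // Model of the NumSkip loop above: stop at the first store with a
    // larger alignment than the first store, at the zero-after-non-zero
    // boundary, or at the end of the run.
    unsigned computeNumSkip(const std::vector<unsigned> &Aligns,
                            unsigned FirstZeroAfterNonZero) {
      unsigned NumSkip = 1;
      while (NumSkip < Aligns.size() && NumSkip < FirstZeroAfterNonZero &&
             Aligns[NumSkip] <= Aligns[0])
        ++NumSkip;
      return NumSkip;
    }

    int main() {
      // Alignment improves at index 2, so two candidates are dropped.
      assert(computeNumSkip({4, 4, 8, 4}, 4) == 2);
      return 0;
    }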
-      RV = true;        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);       continue;     } @@ -12849,25 +13282,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {       unsigned FirstStoreAS = FirstInChain->getAddressSpace();       unsigned FirstStoreAlign = FirstInChain->getAlignment();       unsigned NumStoresToMerge = 1; -      bool IsVec = MemVT.isVector();       for (unsigned i = 0; i < NumConsecutiveStores; ++i) {         StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -        unsigned StoreValOpcode = St->getValue().getOpcode(); +        SDValue StVal = peekThroughBitcast(St->getValue());         // This restriction could be loosened.         // Bail out if any stored values are not elements extracted from a         // vector. It should be possible to handle mixed sources, but load         // sources need more careful handling (see the block of code below that         // handles consecutive loads). -        if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && -            StoreValOpcode != ISD::EXTRACT_SUBVECTOR) +        if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT && +            StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)           return RV;         // Find a legal type for the vector store. -        unsigned Elts = i + 1; -        if (IsVec) { -          // When merging vector stores, get the total number of elements. -          Elts *= MemVT.getVectorNumElements(); -        } +        unsigned Elts = (i + 1) * NumMemElts;         EVT Ty =             EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);         bool IsFast; @@ -12879,6 +13307,23 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {           NumStoresToMerge = i + 1;       } +      // Check if we found a legal vector type that creates a meaningful merge. +      if (NumStoresToMerge < 2) { +        // We know that candidate stores are in order and of correct +        // shape. While there is no mergeable sequence from the +        // beginning one may start later in the sequence. The only +        // reason a merge of size N could have failed where another of +        // the same size would not have, is if the alignment has +        // improved. Drop as many candidates as we can here. +        unsigned NumSkip = 1; +        while ((NumSkip < NumConsecutiveStores) && +               (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) +          NumSkip++; + +        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); +        continue; +      } +       bool Merged = MergeStoresOfConstantsOrVecElts(           StoreNodes, MemVT, NumStoresToMerge, false, true, false);       if (!Merged) { @@ -12905,7 +13350,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {     BaseIndexOffset LdBasePtr;     for (unsigned i = 0; i < NumConsecutiveStores; ++i) {       StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -      LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); +      SDValue Val = peekThroughBitcast(St->getValue()); +      LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val);       if (!Ld)         break; @@ -12917,10 +13363,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {       if (Ld->isVolatile() || Ld->isIndexed())         break; -      // We do not accept ext loads. -      if (Ld->getExtensionType() != ISD::NON_EXTLOAD) -        break; -        // The stored memory type must be the same.       
if (Ld->getMemoryVT() != MemVT)         break; @@ -12986,7 +13428,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {         isDereferenceable = false;       // Find a legal type for the vector store. -      EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); +      unsigned Elts = (i + 1) * NumMemElts; +      EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); +       bool IsFastSt, IsFastLd;       if (TLI.isTypeLegal(StoreTy) &&           TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && @@ -13023,8 +13467,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {             TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,                                StoreTy) &&             TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && -            TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, -                                   FirstStoreAS, FirstStoreAlign, &IsFastSt) && +            TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, +                                   FirstStoreAlign, &IsFastSt) &&             IsFastSt &&             TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,                                    FirstLoadAlign, &IsFastLd) && @@ -13047,7 +13491,19 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {     NumElem = std::min(LastLegalType, NumElem);     if (NumElem < 2) { -      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); +      // We know that candidate stores are in order and of correct +      // shape. While there is no mergeable sequence from the +      // beginning one may start later in the sequence. The only +      // reason a merge of size N could have failed where another of +      // the same size would not have is if the alignment of either +      // the load or store has improved. Drop as many candidates as we +      // can here. +      unsigned NumSkip = 1; +      while ((NumSkip < LoadNodes.size()) && +             (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && +             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) +        NumSkip++; +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);       continue;     } @@ -13055,7 +13511,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {     // to memory.     EVT JointMemOpVT;     if (UseVectorTy) { -      JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); +      // Find a legal type for the vector store. +      unsigned Elts = NumElem * NumMemElts; +      JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);     } else {       unsigned SizeInBits = NumElem * ElementSizeBytes * 8;       JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); @@ -13104,12 +13562,17 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {                                     SDValue(NewLoad.getNode(), 1));     } -    // Replace the all stores with the new store. -    for (unsigned i = 0; i < NumElem; ++i) +    // Replace all the stores with the new store. Recursively remove the +    // corresponding values if they are no longer used.
+    for (unsigned i = 0; i < NumElem; ++i) { +      SDValue Val = StoreNodes[i].MemNode->getOperand(1);        CombineTo(StoreNodes[i].MemNode, NewStore); +      if (Val.getNode()->use_empty()) +        recursivelyDeleteUnusedNodes(Val.getNode()); +    } +      RV = true;      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -    continue;    }    return RV;  } @@ -13284,7 +13747,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {      // See if we can simplify the input to this truncstore with knowledge that      // only the low bits are being used.  For example:      // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8" -    SDValue Shorter = GetDemandedBits( +    SDValue Shorter = DAG.GetDemandedBits(          Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),                                      ST->getMemoryVT().getScalarSizeInBits()));      AddToWorklist(Value.getNode()); @@ -13356,11 +13819,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {                               Ptr, ST->getMemoryVT(), ST->getMemOperand());    } -  // Only perform this optimization before the types are legal, because we -  // don't want to perform this optimization on every DAGCombine invocation. -  if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG -                                           : !LegalTypes) { -    for (;;) { +  // Always perform this optimization before types are legal. If the target +  // prefers, also try this after legalization to catch stores that were created +  // by intrinsics or other nodes. +  if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) { +    while (true) {        // There can be multiple store sequences on the same chain.        // Keep trying to merge store sequences until we are unable to do so        // or until we merge the last store on the chain. @@ -13499,6 +13962,60 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {    return St1;  } +/// Convert a disguised subvector insertion into a shuffle: +/// insert_vector_elt V, (bitcast X from vector type), IdxC --> +/// bitcast(shuffle (bitcast V), (extended X), Mask) +/// Note: We do not use an insert_subvector node because that requires a legal +/// subvector type. +SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { +  SDValue InsertVal = N->getOperand(1); +  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() || +      !InsertVal.getOperand(0).getValueType().isVector()) +    return SDValue(); + +  SDValue SubVec = InsertVal.getOperand(0); +  SDValue DestVec = N->getOperand(0); +  EVT SubVecVT = SubVec.getValueType(); +  EVT VT = DestVec.getValueType(); +  unsigned NumSrcElts = SubVecVT.getVectorNumElements(); +  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits(); +  unsigned NumMaskVals = ExtendRatio * NumSrcElts; + +  // Step 1: Create a shuffle mask that implements this insert operation. The +  // vector that we are inserting into will be operand 0 of the shuffle, so +  // those elements are just 'i'. The inserted subvector is in the first +  // positions of operand 1 of the shuffle. Example: +  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7} +  SmallVector<int, 16> Mask(NumMaskVals); +  for (unsigned i = 0; i != NumMaskVals; ++i) { +    if (i / NumSrcElts == InsIndex) +      Mask[i] = (i % NumSrcElts) + NumMaskVals; +    else +      Mask[i] = i; +  } + +  // Bail out if the target can not handle the shuffle we want to create. 
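The Step 1 mask construction can be checked standalone; this reproduces the example from the comment above (insert a v2i16 into a v4i32 at index 2, viewed as eight i16 lanes):

    #include <cassert>
    #include <vector>

    // Build the shuffle mask for inserting a NumSrcElts-wide subvector at
    // element InsIndex of the destination, exactly as Step 1 above does:
    // lanes in the insertion slot read from operand 1 (offset NumMaskVals),
    // every other lane passes operand 0 through.
    std::vector<int> makeInsertMask(unsigned NumMaskVals, unsigned NumSrcElts,
                                    unsigned InsIndex) {
      std::vector<int> Mask(NumMaskVals);
      for (unsigned i = 0; i != NumMaskVals; ++i)
        Mask[i] = (i / NumSrcElts == InsIndex)
                      ? int(i % NumSrcElts + NumMaskVals)
                      : int(i);
      return Mask;
    }

    int main() {
      // insert v4i32 V, (v2i16 X), 2 --> 8 x i16 mask {0,1,2,3,8,9,6,7}
      std::vector<int> Expected{0, 1, 2, 3, 8, 9, 6, 7};
      assert(makeInsertMask(8, 2, 2) == Expected);
      return 0;
    }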
+  EVT SubVecEltVT = SubVecVT.getVectorElementType(); +  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals); +  if (!TLI.isShuffleMaskLegal(Mask, ShufVT)) +    return SDValue(); + +  // Step 2: Create a wide vector from the inserted source vector by appending +  // undefined elements. This is the same size as our destination vector. +  SDLoc DL(N); +  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT)); +  ConcatOps[0] = SubVec; +  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps); + +  // Step 3: Shuffle in the padded subvector. +  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec); +  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask); +  AddToWorklist(PaddedSubV.getNode()); +  AddToWorklist(DestVecBC.getNode()); +  AddToWorklist(Shuf.getNode()); +  return DAG.getBitcast(VT, Shuf); +} +  SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {    SDValue InVec = N->getOperand(0);    SDValue InVal = N->getOperand(1); @@ -13511,10 +14028,20 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {    EVT VT = InVec.getValueType(); -  // Check that we know which element is being inserted -  if (!isa<ConstantSDNode>(EltNo)) +  // Remove redundant insertions: +  // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x +  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && +      InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1)) +    return InVec; + +  // We must know which element is being inserted for folds below here. +  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); +  if (!IndexC)      return SDValue(); -  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); +  unsigned Elt = IndexC->getZExtValue(); + +  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) +    return Shuf;    // Canonicalize insert_vector_elt dag nodes.    // Example: @@ -13692,9 +14219,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {      // converts.    } -  // extract_vector_elt (v2i32 (bitcast i64:x)), 0 -> i32 (trunc i64:x) +  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x) +  bool isLE = DAG.getDataLayout().isLittleEndian(); +  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;    if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() && -      ConstEltNo->isNullValue() && VT.isInteger()) { +      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {      SDValue BCSrc = InVec.getOperand(0);      if (BCSrc.getValueType().isScalarInteger())        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); @@ -13748,7 +14277,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {      // FIXME: We should handle recursing on other vector shuffles and      // scalar_to_vector here as well. -    if (!LegalOperations) { +    if (!LegalOperations || +        // FIXME: Should really be just isOperationLegalOrCustom. +        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) || +        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {        EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,                           DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy)); @@ -14054,10 +14586,18 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,    EVT InVT1 = VecIn1.getValueType();    EVT InVT2 = VecIn2.getNode() ? 
VecIn2.getValueType() : InVT1; -  unsigned Vec2Offset = InVT1.getVectorNumElements(); +  unsigned Vec2Offset = 0;   unsigned NumElems = VT.getVectorNumElements();   unsigned ShuffleNumElems = NumElems; +  // In case both input vectors are extracted from the same base +  // vector, we do not need the extra addend (Vec2Offset) while +  // computing the shuffle mask. +  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) || +      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) || +      !(VecIn1.getOperand(0) == VecIn2.getOperand(0))) +    Vec2Offset = InVT1.getVectorNumElements(); +    // We can't generate a shuffle node with mismatched input and output types.   // Try to make the types match the type of the output.   if (InVT1 != VT || InVT2 != VT) { @@ -14072,7 +14612,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,       VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);       VecIn2 = SDValue();     } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { -      if (!TLI.isExtractSubvectorCheap(VT, NumElems)) +      if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))         return SDValue();       if (!VecIn2.getNode()) { @@ -14204,7 +14744,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||         !isa<ConstantSDNode>(Op.getOperand(1)))       return SDValue(); -     SDValue ExtractedFromVec = Op.getOperand(0);     // All inputs must have the same element type as the output. @@ -14227,6 +14766,50 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {   if (VecIn.size() < 2)     return SDValue(); +  // If all the operands of the BUILD_VECTOR extract from the same +  // vector, then split the vector efficiently based on the maximum +  // vector access index and adjust the VectorMask and +  // VecIn accordingly. +  if (VecIn.size() == 2) { +    unsigned MaxIndex = 0; +    unsigned NearestPow2 = 0; +    SDValue Vec = VecIn.back(); +    EVT InVT = Vec.getValueType(); +    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); +    SmallVector<unsigned, 8> IndexVec(NumElems, 0); + +    for (unsigned i = 0; i < NumElems; i++) { +      if (VectorMask[i] <= 0) +        continue; +      unsigned Index = N->getOperand(i).getConstantOperandVal(1); +      IndexVec[i] = Index; +      MaxIndex = std::max(MaxIndex, Index); +    } + +    NearestPow2 = PowerOf2Ceil(MaxIndex); +    if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 && +        NumElems * 2 < NearestPow2) { +      unsigned SplitSize = NearestPow2 / 2; +      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), +                                     InVT.getVectorElementType(), SplitSize); +      if (TLI.isTypeLegal(SplitVT)) { +        SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, +                                     DAG.getConstant(SplitSize, DL, IdxTy)); +        SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, +                                     DAG.getConstant(0, DL, IdxTy)); +        VecIn.pop_back(); +        VecIn.push_back(VecIn1); +        VecIn.push_back(VecIn2); + +        for (unsigned i = 0; i < NumElems; i++) { +          if (VectorMask[i] <= 0) +            continue; +          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2; +        } +      } +    } +  } +    // TODO: We want to sort the vectors by descending length, so that adjacent   // pairs have similar length, and the longer vector is always first in the   // pair. 
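A sketch of the split decision added above: round the maximum accessed index up to a power of two, halve it, and reassign each build_vector operand to the low or high half (powerOf2Ceil models llvm::PowerOf2Ceil; the real code additionally checks type legality and profitability):

    #include <cassert>
    #include <vector>

    // Round up to the next power of two, like llvm::PowerOf2Ceil.
    unsigned powerOf2Ceil(unsigned V) {
      unsigned P = 1;
      while (P < V)
        P *= 2;
      return P;
    }

    int main() {
      // Elements extracted from one wide source vector at these indices:
      std::vector<unsigned> IndexVec{0, 1, 16, 17};
      unsigned MaxIndex = 17;
      unsigned NearestPow2 = powerOf2Ceil(MaxIndex); // 32
      unsigned SplitSize = NearestPow2 / 2;          // 16
      // Operands with index < SplitSize come from the low half (input 1),
      // the rest from the high half (input 2), as in the mask rewrite above.
      std::vector<int> VectorMask;
      for (unsigned Idx : IndexVec)
        VectorMask.push_back(Idx < SplitSize ? 1 : 2);
      assert((VectorMask == std::vector<int>{1, 1, 2, 2}));
      return 0;
    }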
@@ -14315,77 +14898,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {            DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);      }    } -    return Shuffles[0];  } -// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT -// operations which can be matched to a truncate. -SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) { -  // TODO: Add support for big-endian. -  if (DAG.getDataLayout().isBigEndian()) -    return SDValue(); -  if (N->getNumOperands() < 2) -    return SDValue(); -  SDLoc DL(N); -  EVT VT = N->getValueType(0); -  unsigned NumElems = N->getNumOperands(); - -  if (!isTypeLegal(VT)) -    return SDValue(); - -  // If the input is something other than an EXTRACT_VECTOR_ELT with a constant -  // index, bail out. -  // TODO: Allow undef elements in some cases? -  if (any_of(N->ops(), [VT](SDValue Op) { -        return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || -               !isa<ConstantSDNode>(Op.getOperand(1)) || -               Op.getValueType() != VT.getVectorElementType(); -      })) -    return SDValue(); - -  // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index -  auto GetExtractIdx = [](SDValue Extract) { -    return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue(); -  }; - -  // The first BUILD_VECTOR operand must be an an extract from index zero -  // (assuming no undef and little-endian). -  if (GetExtractIdx(N->getOperand(0)) != 0) -    return SDValue(); - -  // Compute the stride from the first index. -  int Stride = GetExtractIdx(N->getOperand(1)); -  SDValue ExtractedFromVec = N->getOperand(0).getOperand(0); - -  // Proceed only if the stride and the types can be matched to a truncate. -  if ((Stride == 1 || !isPowerOf2_32(Stride)) || -      (ExtractedFromVec.getValueType().getVectorNumElements() != -       Stride * NumElems) || -      (VT.getScalarSizeInBits() * Stride > 64)) -    return SDValue(); - -  // Check remaining operands are consistent with the computed stride. -  for (unsigned i = 1; i != NumElems; ++i) { -    SDValue Op = N->getOperand(i); - -    if ((Op.getOperand(0) != ExtractedFromVec) || -        (GetExtractIdx(Op) != Stride * i)) -      return SDValue(); -  } - -  // All checks were ok, construct the truncate. -  LLVMContext &Ctx = *DAG.getContext(); -  EVT NewVT = VT.getVectorVT( -      Ctx, EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits() * Stride), NumElems); -  EVT TruncVT = -      VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT; - -  SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec); -  Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res); -  return DAG.getBitcast(VT, Res); -} -  SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {    EVT VT = N->getValueType(0); @@ -14428,10 +14943,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {    if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))      return V; -  if (TLI.isDesirableToCombineBuildVectorToTruncate()) -    if (SDValue V = reduceBuildVecToTrunc(N)) -      return V; -    if (SDValue V = reduceBuildVecToShuffle(N))      return V; @@ -14514,8 +15025,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {    for (SDValue Op : N->ops()) {      // Peek through any bitcast. -    while (Op.getOpcode() == ISD::BITCAST) -      Op = Op.getOperand(0); +    Op = peekThroughBitcast(Op);      // UNDEF nodes convert to UNDEF shuffle mask values.      
if (Op.isUndef()) { @@ -14534,8 +15044,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {      EVT ExtVT = ExtVec.getValueType();      // Peek through any bitcast. -    while (ExtVec.getOpcode() == ISD::BITCAST) -      ExtVec = ExtVec.getOperand(0); +    ExtVec = peekThroughBitcast(ExtVec);      // UNDEF nodes convert to UNDEF shuffle mask values.      if (ExtVec.isUndef()) { @@ -14760,9 +15269,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {    // We are looking for an optionally bitcasted wide vector binary operator    // feeding an extract subvector. -  SDValue BinOp = Extract->getOperand(0); -  if (BinOp.getOpcode() == ISD::BITCAST) -    BinOp = BinOp.getOperand(0); +  SDValue BinOp = peekThroughBitcast(Extract->getOperand(0));    // TODO: The motivating case for this transform is an x86 AVX1 target. That    // target has temptingly almost legal versions of bitwise logic ops in 256-bit @@ -14786,13 +15293,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {      return SDValue();    // Peek through bitcasts of the binary operator operands if needed. -  SDValue LHS = BinOp.getOperand(0); -  if (LHS.getOpcode() == ISD::BITCAST) -    LHS = LHS.getOperand(0); - -  SDValue RHS = BinOp.getOperand(1); -  if (RHS.getOpcode() == ISD::BITCAST) -    RHS = RHS.getOperand(0); +  SDValue LHS = peekThroughBitcast(BinOp.getOperand(0)); +  SDValue RHS = peekThroughBitcast(BinOp.getOperand(1));    // We need at least one concatenation operation of a binop operand to make    // this transform worthwhile. The concat must double the input vector sizes. @@ -14891,8 +15393,34 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {    }    // Skip bitcasting -  if (V->getOpcode() == ISD::BITCAST) -    V = V.getOperand(0); +  V = peekThroughBitcast(V); + +  // If the input is a build vector, try to make a smaller build vector. +  if (V->getOpcode() == ISD::BUILD_VECTOR) { +    if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) { +      EVT InVT = V->getValueType(0); +      unsigned ExtractSize = NVT.getSizeInBits(); +      unsigned EltSize = InVT.getScalarSizeInBits(); +      // Only do this if we won't split any elements. +      if (ExtractSize % EltSize == 0) { +        unsigned NumElems = ExtractSize / EltSize; +        EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), +                                         InVT.getVectorElementType(), NumElems); +        if ((!LegalOperations || +             TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) && +            (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { +          unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) / +                            EltSize; + +          // Extract the pieces from the original build_vector. 
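The index arithmetic above converts the extraction offset from result-vector elements into source build_vector operands; a worked model with hypothetical example types (a v2i32 extracted from a bitcast v8i16 build_vector):

    #include <cassert>

    int main() {
      // NVT = v2i32 (64 bits) extracted at index 2 from a bitcast of a
      // v8i16 build_vector; EltSize is the source scalar size.
      unsigned ExtractSize = 64, EltSize = 16, NVTScalarBits = 32, Idx = 2;
      unsigned NumElems = ExtractSize / EltSize;       // 4 source operands
      unsigned IdxVal = Idx * NVTScalarBits / EltSize; // first operand: 4
      assert(NumElems == 4 && IdxVal == 4);
      // The smaller build_vector takes operands [IdxVal, IdxVal + NumElems).
      return 0;
    }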
+          SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), +                                            makeArrayRef(V->op_begin() + IdxVal, +                                                         NumElems)); +          return DAG.getBitcast(NVT, BuildVec); +        } +      } +    } +  }    if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {      // Handle only simple case where vector being inserted and vector @@ -15013,6 +15541,37 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,    return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());  } +static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0, +                                   SDValue N1, SelectionDAG &DAG) { +  auto isUndefElt = [](SDValue V, int Idx) { +    // TODO - handle more cases as required. +    if (V.getOpcode() == ISD::BUILD_VECTOR) +      return V.getOperand(Idx).isUndef(); +    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) +      return (Idx != 0) || V.getOperand(0).isUndef(); +    return false; +  }; + +  EVT VT = SVN->getValueType(0); +  unsigned NumElts = VT.getVectorNumElements(); + +  bool Changed = false; +  SmallVector<int, 8> NewMask; +  for (unsigned i = 0; i != NumElts; ++i) { +    int Idx = SVN->getMaskElt(i); +    if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) || +        ((int)NumElts < Idx && isUndefElt(N1, Idx - NumElts))) { +      Changed = true; +      Idx = -1; +    } +    NewMask.push_back(Idx); +  } +  if (Changed) +    return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask); + +  return SDValue(); +} +  // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,  // or turn a shuffle of a single concat into simpler shuffle then concat.  static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { @@ -15091,7 +15650,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {  //  // To deal with this, we currently use a bunch of mostly arbitrary heuristics.  // We don't fold shuffles where one side is a non-zero constant, and we don't -// fold shuffles if the resulting BUILD_VECTOR would have duplicate +// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate  // non-constant operands. This seems to work out reasonably well in practice.  static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,                                         SelectionDAG &DAG, @@ -15103,6 +15662,7 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,    if (!N0->hasOneUse() || !N1->hasOneUse())      return SDValue(); +    // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as    // discussed above.    if (!N1.isUndef()) { @@ -15114,6 +15674,15 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,        return SDValue();    } +  // If both inputs are splats of the same value then we can safely merge this +  // to a single BUILD_VECTOR with undef elements based on the shuffle mask. 
+  bool IsSplat = false; +  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0); +  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1); +  if (BV0 && BV1) +    if (SDValue Splat0 = BV0->getSplatValue()) +      IsSplat = (Splat0 == BV1->getSplatValue()); +    SmallVector<SDValue, 8> Ops;    SmallSet<SDValue, 16> DuplicateOps;    for (int M : SVN->getMask()) { @@ -15124,23 +15693,25 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,        if (S.getOpcode() == ISD::BUILD_VECTOR) {          Op = S.getOperand(Idx);        } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { -        if (Idx == 0) -          Op = S.getOperand(0); +        assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index."); +        Op = S.getOperand(0);        } else {          // Operand can't be combined - bail out.          return SDValue();        }      } -    // Don't duplicate a non-constant BUILD_VECTOR operand; semantically, this is -    // fine, but it's likely to generate low-quality code if the target can't -    // reconstruct an appropriate shuffle. +    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're +    // generating a splat; semantically, this is fine, but it's likely to +    // generate low-quality code if the target can't reconstruct an appropriate +    // shuffle.      if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op)) -      if (!DuplicateOps.insert(Op).second) +      if (!IsSplat && !DuplicateOps.insert(Op).second)          return SDValue();      Ops.push_back(Op);    } +    // BUILD_VECTOR requires all inputs to be of the same type, find the    // maximum type and extend them all.    EVT SVT = VT.getScalarType(); @@ -15162,7 +15733,8 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,  static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,                                              SelectionDAG &DAG,                                              const TargetLowering &TLI, -                                            bool LegalOperations) { +                                            bool LegalOperations, +                                            bool LegalTypes) {    EVT VT = SVN->getValueType(0);    bool IsBigEndian = DAG.getDataLayout().isBigEndian(); @@ -15190,14 +15762,18 @@ static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,    // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for    // power-of-2 extensions as they are the most likely.    
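The scale loop below tests whether the mask behaves like an integer extension. This models the isAnyExtend predicate it calls, which is defined earlier in the function and not shown in this hunk, so treat the exact pattern as a reconstruction: for little-endian, source element i must land in lane i*Scale and every other lane must be undef:

    #include <cassert>
    #include <vector>

    // Does Mask implement an any_extend_vector_inreg by `Scale`?
    // Lane i must be undef (-1), or i must be Scale-aligned and read
    // source element i / Scale.
    bool isAnyExtendMask(const std::vector<int> &Mask, unsigned Scale) {
      for (unsigned i = 0; i != Mask.size(); ++i) {
        if (Mask[i] < 0)
          continue;
        if ((i % Scale) != 0 || Mask[i] != int(i / Scale))
          return false;
      }
      return true;
    }

    int main() {
      // A v4i32 shuffle acting like a 2x integer extension: {0, u, 1, u}.
      assert(isAnyExtendMask({0, -1, 1, -1}, 2));
      assert(!isAnyExtendMask({0, 1, 2, 3}, 2));
      return 0;
    }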
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { +    // Check for non power of 2 vector sizes +    if (NumElts % Scale != 0) +      continue;      if (!isAnyExtend(Scale))        continue;      EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);      EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); -    if (!LegalOperations || -        TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) -      return DAG.getBitcast(VT, +    if (!LegalTypes || TLI.isTypeLegal(OutVT)) +      if (!LegalOperations || +          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) +        return DAG.getBitcast(VT,                              DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));    } @@ -15218,9 +15794,7 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,    if (!VT.isInteger() || IsBigEndian)      return SDValue(); -  SDValue N0 = SVN->getOperand(0); -  while (N0.getOpcode() == ISD::BITCAST) -    N0 = N0.getOperand(0); +  SDValue N0 = peekThroughBitcast(SVN->getOperand(0));    unsigned Opcode = N0.getOpcode();    if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && @@ -15316,6 +15890,84 @@ static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,                                NewMask);  } +/// If the shuffle mask is taking exactly one element from the first vector +/// operand and passing through all other elements from the second vector +/// operand, return the index of the mask element that is choosing an element +/// from the first operand. Otherwise, return -1. +static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) { +  int MaskSize = Mask.size(); +  int EltFromOp0 = -1; +  // TODO: This does not match if there are undef elements in the shuffle mask. +  // Should we ignore undefs in the shuffle mask instead? The trade-off is +  // removing an instruction (a shuffle), but losing the knowledge that some +  // vector lanes are not needed. +  for (int i = 0; i != MaskSize; ++i) { +    if (Mask[i] >= 0 && Mask[i] < MaskSize) { +      // We're looking for a shuffle of exactly one element from operand 0. +      if (EltFromOp0 != -1) +        return -1; +      EltFromOp0 = i; +    } else if (Mask[i] != i + MaskSize) { +      // Nothing from operand 1 can change lanes. +      return -1; +    } +  } +  return EltFromOp0; +} + +/// If a shuffle inserts exactly one element from a source vector operand into +/// another vector operand and we can access the specified element as a scalar, +/// then we can eliminate the shuffle. +static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, +                                      SelectionDAG &DAG) { +  // First, check if we are taking one element of a vector and shuffling that +  // element into another vector. +  ArrayRef<int> Mask = Shuf->getMask(); +  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end()); +  SDValue Op0 = Shuf->getOperand(0); +  SDValue Op1 = Shuf->getOperand(1); +  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask); +  if (ShufOp0Index == -1) { +    // Commute mask and check again. +    ShuffleVectorSDNode::commuteMask(CommutedMask); +    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask); +    if (ShufOp0Index == -1) +      return SDValue(); +    // Commute operands to match the commuted shuffle mask. +    std::swap(Op0, Op1); +    Mask = CommutedMask; +  } + +  // The shuffle inserts exactly one element from operand 0 into operand 1. 
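That single-insertion property is exactly what getShuffleMaskIndexOfOneElementFromOp0IntoOp1 above detects; a direct standalone transcription over std::vector (note that an undef lane, -1, also fails the operand 1 test, matching the TODO above):

    #include <cassert>
    #include <vector>

    // Return the unique lane taking an element from operand 0 while all
    // other lanes pass operand 1 through unchanged; -1 if no such lane.
    int indexOfOneElementFromOp0IntoOp1(const std::vector<int> &Mask) {
      int MaskSize = (int)Mask.size(), EltFromOp0 = -1;
      for (int i = 0; i != MaskSize; ++i) {
        if (Mask[i] >= 0 && Mask[i] < MaskSize) {
          if (EltFromOp0 != -1)
            return -1; // more than one element from operand 0
          EltFromOp0 = i;
        } else if (Mask[i] != i + MaskSize) {
          return -1; // an operand 1 element changed lanes
        }
      }
      return EltFromOp0;
    }

    int main() {
      assert(indexOfOneElementFromOp0IntoOp1({4, 5, 0, 7}) == 2);
      assert(indexOfOneElementFromOp0IntoOp1({4, 0, 0, 7}) == -1);
      return 0;
    }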
+  // Now see if we can access that element as a scalar via a real insert element +  // instruction. +  // TODO: We can try harder to locate the element as a scalar. Examples: it +  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. +  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && +         "Shuffle mask value must be from operand 0"); +  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT) +    return SDValue(); + +  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2)); +  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index]) +    return SDValue(); + +  // There's an existing insertelement with constant insertion index, so we +  // don't need to check the legality/profitability of a replacement operation +  // that differs at most in the constant value. The target should be able to +  // lower any of those in a similar way. If not, legalization will expand this +  // to a scalar-to-vector plus shuffle. +  // +  // Note that the shuffle may move the scalar from the position that the insert +  // element used. Therefore, our new insert element occurs at the shuffle's +  // mask index value, not the insert's index value. +  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' +  SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf), +                                        Op0.getOperand(2).getValueType()); +  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), +                     Op1, Op0.getOperand(1), NewInsIndex); +} +  SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {    EVT VT = N->getValueType(0);    unsigned NumElts = VT.getVectorNumElements(); @@ -15362,6 +16014,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {        return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);    } +  // Simplify shuffle mask if a referenced element is UNDEF. +  if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG)) +    return V; + +  if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) +    return InsElt; +    // A shuffle of a single vector that is a splat can always be folded.    if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))      if (N1->isUndef() && N0Shuf->isSplat()) @@ -15426,7 +16085,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {      return S;    // Match shuffles that can be converted to any_vector_extend_in_reg. -  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) +  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))      return V;    // Combine "truncate_vector_in_reg" style shuffles. @@ -15486,7 +16145,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {        if (TLI.isTypeLegal(ScaleVT) &&            0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&            0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) { -          int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();          int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits(); @@ -15661,23 +16319,46 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {    EVT VT = N->getValueType(0);    // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern -  // with a VECTOR_SHUFFLE. +  // with a VECTOR_SHUFFLE and possible truncate.    
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {     SDValue InVec = InVal->getOperand(0);     SDValue EltNo = InVal->getOperand(1); - -    // FIXME: We could support implicit truncation if the shuffle can be -    // scaled to a smaller vector scalar type. -    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo); -    if (C0 && VT == InVec.getValueType() && -        VT.getScalarType() == InVal.getValueType()) { -      SmallVector<int, 8> NewMask(VT.getVectorNumElements(), -1); +    auto InVecT = InVec.getValueType(); +    if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) { +      SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);       int Elt = C0->getZExtValue();       NewMask[0] = Elt; - -      if (TLI.isShuffleMaskLegal(NewMask, VT)) -        return DAG.getVectorShuffle(VT, SDLoc(N), InVec, DAG.getUNDEF(VT), -                                    NewMask); +      SDValue Val; +      // If we have an implicit truncate, do the truncate here as long as it's +      // legal; if it's not legal, this fold is skipped. +      if (VT.getScalarType() != InVal.getValueType() && +          InVal.getValueType().isScalarInteger() && +          isTypeLegal(VT.getScalarType())) { +        Val = +            DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal); +        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val); +      } +      if (VT.getScalarType() == InVecT.getScalarType() && +          VT.getVectorNumElements() <= InVecT.getVectorNumElements() && +          TLI.isShuffleMaskLegal(NewMask, VT)) { +        Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec, +                                   DAG.getUNDEF(InVecT), NewMask); +        // If the initial vector is the correct size, this shuffle is a +        // valid result. +        if (VT == InVecT) +          return Val; +        // If not, we must truncate the vector. +        if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { +          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); +          SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy); +          EVT SubVT = +              EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(), +                               VT.getVectorNumElements()); +          Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val, +                            ZeroIdx); +          return Val; +        } +      }     }   } @@ -15694,12 +16375,47 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {   if (N1.isUndef())     return N0; +  // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow +  // us to pull BITCASTs from input to output. +  if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR) +    if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode())) +      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2); +   // If this is an insert of an extracted vector into an undef vector, we can   // just use the input to the extract.   if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&       N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)     return N1.getOperand(0); +  // If we are inserting a bitcast value into an undef, with the same +  // number of elements, just use the bitcast input of the extract. +  // i.e. 
INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> +  //        BITCAST (INSERT_SUBVECTOR UNDEF N1 N2) +  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && +      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && +      N1.getOperand(0).getOperand(1) == N2 && +      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == +          VT.getVectorNumElements()) { +    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); +  } + +  // If both N0 and N1 are bitcast values on which insert_subvector +  // would make sense, pull the bitcast through. +  // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 -> +  //        BITCAST (INSERT_SUBVECTOR N0 N1 N2) +  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { +    SDValue CN0 = N0.getOperand(0); +    SDValue CN1 = N1.getOperand(0); +    if (CN0.getValueType().getVectorElementType() == +            CN1.getValueType().getVectorElementType() && +        CN0.getValueType().getVectorNumElements() == +            VT.getVectorNumElements()) { +      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), +                                      CN0.getValueType(), CN0, CN1, N2); +      return DAG.getBitcast(VT, NewINSERT); +    } +  } +    // Combine INSERT_SUBVECTORs where we are inserting to the same index.   // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )   // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) @@ -15779,7 +16495,7 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {  SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {   EVT VT = N->getValueType(0);   SDValue LHS = N->getOperand(0); -  SDValue RHS = N->getOperand(1); +  SDValue RHS = peekThroughBitcast(N->getOperand(1));   SDLoc DL(N);   // Make sure we're not running after operation legalization where it @@ -15790,9 +16506,6 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {   if (N->getOpcode() != ISD::AND)     return SDValue(); -  if (RHS.getOpcode() == ISD::BITCAST) -    RHS = RHS.getOperand(0); -   if (RHS.getOpcode() != ISD::BUILD_VECTOR)     return SDValue(); @@ -15945,7 +16658,6 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,  /// the DAG combiner loop to avoid it being looked at.  bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,                                     SDValue RHS) { -   // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))   // The select + setcc is redundant, because fsqrt returns NaN for X < 0.   if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) { @@ -16418,7 +17130,7 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,  SDValue DAGCombiner::BuildSDIV(SDNode *N) {   // when optimising for minimum size, we don't want to expand a div to a mul   // and a shift. -  if (DAG.getMachineFunction().getFunction()->optForMinSize()) +  if (DAG.getMachineFunction().getFunction().optForMinSize())     return SDValue();   ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); @@ -16429,7 +17141,7 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) {   if (C->isNullValue())     return SDValue(); -  std::vector<SDNode*> Built; +  std::vector<SDNode *> Built;   SDValue S =       TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); @@ -16464,7 +17176,7 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {  SDValue DAGCombiner::BuildUDIV(SDNode *N) {   // when optimising for minimum size, we don't want to expand a div to a mul   // and a shift. 
-  if (DAG.getMachineFunction().getFunction()->optForMinSize()) +  if (DAG.getMachineFunction().getFunction().optForMinSize())      return SDValue();    ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); @@ -16475,7 +17187,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {    if (C->isNullValue())      return SDValue(); -  std::vector<SDNode*> Built; +  std::vector<SDNode *> Built;    SDValue S =        TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); @@ -16760,8 +17472,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {    if (Op1->isInvariant() && Op0->writeMem())      return false; -  unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3; -  unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; +  unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize(); +  unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();    // Check for BaseIndexOffset matching.    BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); @@ -16957,7 +17669,11 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,  /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain  /// (aliasing node.)  SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { -  SmallVector<SDValue, 8> Aliases;  // Ops for replacing token factor. +  if (OptLevel == CodeGenOpt::None) +    return OldChain; + +  // Ops for replacing token factor. +  SmallVector<SDValue, 8> Aliases;    // Accumulate all the aliases to this node.    GatherAllAliases(N, OldChain, Aliases); @@ -16987,6 +17703,9 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {  // to go from a partially-merged state to the desired final  // fully-merged state.  bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { +  if (OptLevel == CodeGenOpt::None) +    return false; +    // This holds the base pointer, index, and the offset in bytes from the base    // pointer.    BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); | 
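BuildSDIV/BuildUDIV above hand the constant divisor to TLI, which expands it with a magic-number multiply plus shift, the very transform the minsize check is avoiding. The classic unsigned example, dividing a 32-bit value by 10 with ceil(2^35 / 10) as the multiplier:

    #include <cassert>
    #include <cstdint>

    // Unsigned divide-by-10 lowered to multiply + shift, the kind of
    // expansion TLI.BuildUDIV produces for a constant divisor.
    uint32_t div10(uint32_t X) {
      return (uint64_t(X) * 0xCCCCCCCDull) >> 35; // 0xCCCCCCCD = ceil(2^35/10)
    }

    int main() {
      assert(div10(0) == 0);
      assert(div10(99) == 9);
      assert(div10(0xFFFFFFFFu) == 429496729);
      return 0;
    }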
