| author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
| commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e (patch) | |
| tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/CodeGen/SelectionDAG/DAGCombiner.cpp | |
| parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff) | |
Diffstat (limited to 'lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2497 |
1 file changed, 1608 insertions, 889 deletions
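The change that recurs throughout this diff is the replacement of single-`ConstantSDNode` checks with the new `matchUnaryPredicate`/`matchBinaryPredicate` helpers, which accept either a scalar/splat constant or a constant `BUILD_VECTOR` and apply a predicate per lane (see their definitions and the `MatchRotateSum`, `MatchShiftTooBig`, and `MatchOutOfRange`/`MatchInRange` call sites below). A minimal stand-alone model of that pattern (plain C++ stand-ins, not the SelectionDAG API; the `Val` type and its fields are hypothetical):

```cpp
#include <cstdint>
#include <functional>
#include <vector>

// Illustrative stand-in for "ConstantSDNode or constant BUILD_VECTOR";
// not an LLVM type.
struct Val {
  bool IsVector;
  uint64_t Scalar;             // used when !IsVector (a scalar/splat constant)
  std::vector<uint64_t> Elts;  // used when IsVector (one constant per lane)
};

// Mirrors matchUnaryPredicate: the predicate must hold for the scalar
// constant, or for every lane of the vector constant.
static bool matchUnaryPredicate(const Val &Op,
                                std::function<bool(uint64_t)> Match) {
  if (!Op.IsVector)
    return Match(Op.Scalar);
  for (uint64_t E : Op.Elts)
    if (!Match(E))
      return false;
  return true;
}

// Mirrors matchBinaryPredicate: pairwise over two constants of the same shape.
static bool matchBinaryPredicate(const Val &L, const Val &R,
                                 std::function<bool(uint64_t, uint64_t)> Match) {
  if (L.IsVector != R.IsVector ||
      (L.IsVector && L.Elts.size() != R.Elts.size()))
    return false;
  if (!L.IsVector)
    return Match(L.Scalar, R.Scalar);
  for (size_t I = 0, E = L.Elts.size(); I != E; ++I)
    if (!Match(L.Elts[I], R.Elts[I]))
      return false;
  return true;
}

int main() {
  const unsigned OpSizeInBits = 32;

  // "shift amount too big -> undef" fires only if EVERY lane is out of
  // range, matching the diff's MatchShiftTooBig note about partial UNDEFs.
  Val Shift{true, 0, {33, 40, 64}};
  bool AllTooBig = matchUnaryPredicate(
      Shift, [&](uint64_t C) { return C >= OpSizeInBits; });

  // Rotate matching: lanewise c1 + c2 == bitwidth, as in MatchRotateSum.
  Val C1{true, 0, {15, 17}};
  Val C2{true, 0, {17, 15}};
  bool IsRotate = matchBinaryPredicate(
      C1, C2, [&](uint64_t A, uint64_t B) { return A + B == OpSizeInBits; });

  return (AllTooBig && IsRotate) ? 0 : 1;
}
```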
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 432c86dd6f1e1..f97732c1c49d0 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1,4 +1,4 @@ -//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===// +//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// // // The LLVM Compiler Infrastructure // @@ -16,32 +16,64 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/DAGCombine.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <cassert> +#include <cstdint> +#include <functional> +#include <iterator> +#include <string> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "dagcombine" @@ -53,43 +85,41 @@ STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); STATISTIC(SlicedLoads, "Number of load sliced"); -namespace { - static cl::opt<bool> - CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, - cl::desc("Enable DAG combiner's use of IR alias analysis")); +static cl::opt<bool> +CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, + cl::desc("Enable DAG combiner's use of IR alias analysis")); - static cl::opt<bool> - UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), - cl::desc("Enable DAG combiner's use of TBAA")); +static cl::opt<bool> +UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), + cl::desc("Enable DAG combiner's use of TBAA")); #ifndef NDEBUG - static cl::opt<std::string> - 
CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, - cl::desc("Only use DAG-combiner alias analysis in this" - " function")); +static cl::opt<std::string> +CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, + cl::desc("Only use DAG-combiner alias analysis in this" + " function")); #endif - /// Hidden option to stress test load slicing, i.e., when this option - /// is enabled, load slicing bypasses most of its profitability guards. - static cl::opt<bool> - StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, - cl::desc("Bypass the profitability model of load " - "slicing"), - cl::init(false)); +/// Hidden option to stress test load slicing, i.e., when this option +/// is enabled, load slicing bypasses most of its profitability guards. +static cl::opt<bool> +StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, + cl::desc("Bypass the profitability model of load slicing"), + cl::init(false)); - static cl::opt<bool> - MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), - cl::desc("DAG combiner may split indexing from loads")); +static cl::opt<bool> + MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), + cl::desc("DAG combiner may split indexing from loads")); -//------------------------------ DAGCombiner ---------------------------------// +namespace { class DAGCombiner { SelectionDAG &DAG; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; - bool LegalOperations; - bool LegalTypes; + bool LegalOperations = false; + bool LegalTypes = false; bool ForCodeSize; /// \brief Worklist of all of the nodes that need to be simplified. @@ -128,6 +158,19 @@ namespace { SDValue visit(SDNode *N); public: + DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) + : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), + OptLevel(OL), AA(AA) { + ForCodeSize = DAG.getMachineFunction().getFunction().optForSize(); + + MaximumLegalStoreInBits = 0; + for (MVT VT : MVT::all_valuetypes()) + if (EVT(VT).isSimple() && VT != MVT::Other && + TLI.isTypeLegal(EVT(VT)) && + VT.getSizeInBits() >= MaximumLegalStoreInBits) + MaximumLegalStoreInBits = VT.getSizeInBits(); + } + /// Add to the worklist making sure its instance is at the back (next to be /// processed.) 
void AddToWorklist(SDNode *N) { @@ -285,7 +328,7 @@ namespace { SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); - SDValue visitAssertZext(SDNode *N); + SDValue visitAssertExt(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); @@ -348,6 +391,7 @@ namespace { SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); + SDValue foldVSelectOfConstants(SDNode *N); SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); @@ -371,6 +415,7 @@ namespace { SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); + SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); @@ -400,14 +445,11 @@ namespace { SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); - SDValue reduceBuildVecToTrunc(SDNode *N); SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef<int> VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx); SDValue matchVSelectOpSizesWithSetCC(SDNode *N); - SDValue GetDemandedBits(SDValue V, const APInt &Mask); - /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, @@ -434,12 +476,14 @@ namespace { /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { - MemOpLink(LSBaseSDNode *N, int64_t Offset) - : MemNode(N), OffsetFromBase(Offset) {} // Ptr to the mem node. LSBaseSDNode *MemNode; + // Offset from the base ptr. int64_t OffsetFromBase; + + MemOpLink(LSBaseSDNode *N, int64_t Offset) + : MemNode(N), OffsetFromBase(Offset) {} }; /// This is a helper function for visitMUL to check the profitability @@ -450,38 +494,49 @@ namespace { SDValue &AddNode, SDValue &ConstNode); - /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns - /// the type of the loaded value to be extended. LoadedVT returns the type - /// of the original loaded value. NarrowLoad returns whether the load would - /// need to be narrowed in order to match. + /// the type of the loaded value to be extended. bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, - EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, - bool &NarrowLoad); + EVT LoadResultTy, EVT &ExtVT); + + /// Helper function to calculate whether the given Load can have its + /// width reduced to ExtVT. + bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType, + EVT &ExtVT, unsigned ShAmt = 0); + + /// Used by BackwardsPropagateMask to find suitable loads. + bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads, + SmallPtrSetImpl<SDNode*> &NodeWithConsts, + ConstantSDNode *Mask, SDNode *&UncombinedNode); + /// Attempt to propagate a given AND node back to load leaves so that they + /// can be combined into narrow loads. 
+ bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG); /// Helper function for MergeConsecutiveStores which merges the /// component store chains. SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); - /// This is a helper function for MergeConsecutiveStores. When the source - /// elements of the consecutive stores are all constants or all extracted - /// vector elements, try to merge them into one larger store. - /// \return True if a merged store was created. + /// This is a helper function for MergeConsecutiveStores. When the + /// source elements of the consecutive stores are all constants or + /// all extracted vector elements, try to merge them into one + /// larger store introducing bitcasts if necessary. \return True + /// if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc); - /// This is a helper function for MergeConsecutiveStores. - /// Stores that may be merged are placed in StoreNodes. + /// This is a helper function for MergeConsecutiveStores. Stores + /// that potentially may be merged with St are placed in + /// StoreNodes. void getStoreMergeCandidates(StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes); /// Helper function for MergeConsecutiveStores. Checks if - /// Candidate stores have indirect dependency through their - /// operands. \return True if safe to merge + /// candidate stores have indirect dependency through their + /// operands. \return True if safe to merge. bool checkMergeStoreCandidatesForDependencies( SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); @@ -500,19 +555,6 @@ namespace { SDValue distributeTruncateThroughAnd(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) - : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) { - ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); - - MaximumLegalStoreInBits = 0; - for (MVT VT : MVT::all_valuetypes()) - if (EVT(VT).isSimple() && VT != MVT::Other && - TLI.isTypeLegal(EVT(VT)) && - VT.getSizeInBits() >= MaximumLegalStoreInBits) - MaximumLegalStoreInBits = VT.getSizeInBits(); - } - /// Runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); @@ -541,14 +583,12 @@ namespace { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } }; -} - -namespace { /// This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. 
class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; + public: explicit WorklistRemover(DAGCombiner &dc) : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} @@ -557,7 +597,8 @@ public: DC.removeFromWorklist(N); } }; -} + +} // end anonymous namespace //===----------------------------------------------------------------------===// // TargetLowering::DAGCombinerInfo implementation @@ -577,7 +618,6 @@ CombineTo(SDNode *N, SDValue Res, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); } - SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); @@ -873,6 +913,56 @@ static bool isAnyConstantBuildVector(const SDNode *N) { ISD::isBuildVectorOfConstantFPSDNodes(N); } +// Attempt to match a unary predicate against a scalar/splat constant or +// every element of a constant BUILD_VECTOR. +static bool matchUnaryPredicate(SDValue Op, + std::function<bool(ConstantSDNode *)> Match) { + if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) + return Match(Cst); + + if (ISD::BUILD_VECTOR != Op.getOpcode()) + return false; + + EVT SVT = Op.getValueType().getScalarType(); + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i)); + if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst)) + return false; + } + return true; +} + +// Attempt to match a binary predicate against a pair of scalar/splat constants +// or every element of a pair of constant BUILD_VECTORs. +static bool matchBinaryPredicate( + SDValue LHS, SDValue RHS, + std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) { + if (LHS.getValueType() != RHS.getValueType()) + return false; + + if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS)) + if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS)) + return Match(LHSCst, RHSCst); + + if (ISD::BUILD_VECTOR != LHS.getOpcode() || + ISD::BUILD_VECTOR != RHS.getOpcode()) + return false; + + EVT SVT = LHS.getValueType().getScalarType(); + for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) { + auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i)); + auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i)); + if (!LHSCst || !RHSCst) + return false; + if (LHSCst->getValueType(0) != SVT || + LHSCst->getValueType(0) != RHSCst->getValueType(0)) + return false; + if (!Match(LHSCst, RHSCst)) + return false; + } + return true; +} + SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); @@ -1123,10 +1213,10 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { Replace0 &= !N0->hasOneUse(); Replace1 &= (N0 != N1) && !N1->hasOneUse(); - // Combine Op here so it is presreved past replacements. + // Combine Op here so it is preserved past replacements. CombineTo(Op.getNode(), RV); - // If operands have a use ordering, make sur we deal with + // If operands have a use ordering, make sure we deal with // predecessor first. 
if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { std::swap(N0, N1); @@ -1473,7 +1563,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); - case ISD::AssertZext: return visitAssertZext(N); + case ISD::AssertSext: + case ISD::AssertZext: return visitAssertExt(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); @@ -1572,15 +1663,15 @@ SDValue DAGCombiner::combine(SDNode *N) { } } - // If N is a commutative binary node, try commuting it to enable more - // sdisel CSE. + // If N is a commutative binary node, try eliminate it if the commuted + // version is already present in the DAG. if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Constant operands are canonicalized to RHS. - if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) { + if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) { SDValue Ops[] = {N1, N0}; SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, N->getFlags()); @@ -1632,7 +1723,6 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // Check each of the operands. for (const SDValue &Op : TF->op_values()) { - switch (Op.getOpcode()) { case ISD::EntryToken: // Entry tokens don't need to be added to the list. They are @@ -1907,6 +1997,15 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } } + + // Undo the add -> or combine to merge constant offsets from a frame index. + if (N0.getOpcode() == ISD::OR && + isa<FrameIndexSDNode>(N0.getOperand(0)) && + isa<ConstantSDNode>(N0.getOperand(1)) && + DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { + SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1)); + return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); + } } if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -2064,7 +2163,8 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) } // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) - if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) + if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) && + N1.getResNo() == 0) return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0, N1.getOperand(0), N1.getOperand(2)); @@ -2537,6 +2637,12 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0); N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); + assert((!N0IsConst || + ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) && + "Splat APInt should be element width"); + assert((!N1IsConst || + ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) && + "Splat APInt should be element width"); } else { N0IsConst = isa<ConstantSDNode>(N0); if (N0IsConst) { @@ -2562,12 +2668,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1.isNullValue()) return N1; - // We require a splat of the entire scalar bit width for non-contiguous - // bit patterns. 
- bool IsFullSplat = - ConstValue1.getBitWidth() == VT.getScalarSizeInBits(); // fold (mul x, 1) -> x - if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat) + if (N1IsConst && ConstValue1.isOneValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -2580,16 +2682,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { DAG.getConstant(0, DL, VT), N0); } // fold (mul x, (1 << c)) -> x << c - if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isPowerOf2() && - IsFullSplat) { + if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && + DAG.isKnownToBeAPowerOfTwo(N1) && + (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { SDLoc DL(N); - return DAG.getNode(ISD::SHL, DL, VT, N0, - DAG.getConstant(ConstValue1.logBase2(), DL, - getShiftAmountTy(N0.getValueType()))); + SDValue LogBase2 = BuildLogBase2(N1, DL); + AddToWorklist(LogBase2.getNode()); + + EVT ShiftVT = getShiftAmountTy(N0.getValueType()); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); + AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); } // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c - if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2() && - IsFullSplat) { + if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) { unsigned Log2Val = (-ConstValue1).logBase2(); SDLoc DL(N); // FIXME: If the input is something that is easily negated (e.g. a @@ -2835,7 +2941,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. - AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; @@ -2906,7 +3012,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } // fold (udiv x, c) -> alternate - AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; @@ -2965,7 +3071,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) { } } - AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. @@ -3003,19 +3109,26 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + if (VT.isVector()) { + // fold (mulhs x, 0) -> 0 + if (ISD::isBuildVectorAllZeros(N1.getNode())) + return N1; + if (ISD::isBuildVectorAllZeros(N0.getNode())) + return N0; + } + // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; // fold (mulhs x, 1) -> (sra x, size(x)-1) - if (isOneConstant(N1)) { - SDLoc DL(N); + if (isOneConstant(N1)) return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, DAG.getConstant(N0.getValueSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); - } + // fold (mulhs x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // If the type twice as wide is legal, transform the mulhs to a wider multiply // plus a shift. 
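A note on the `visitMUL` hunk above: the old code required `IsFullSplat` before turning a multiply by a power of two into a shift; the new code fires whenever `DAG.isKnownToBeAPowerOfTwo(N1)` holds and derives the per-lane shift amount with `BuildLogBase2`. The scalar identities behind both folds, checked on plain integers (a sketch of the arithmetic only, not DAG code):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // fold (mul x, (1 << c)) -> (shl x, c): multiplication by a power of two
  // equals a left shift, lane by lane and modulo the bit width.
  uint32_t x = 0xDEADBEEF;
  for (unsigned c = 0; c < 32; ++c)
    assert(x * (uint32_t{1} << c) == x << c);

  // fold (mul x, -(1 << c)) -> -(shl x, c): same shift, then negate.
  int64_t y = 12345;
  unsigned c2 = 7;  // logBase2 of 128
  assert(y * -(int64_t{1} << c2) == -(y << c2));
  return 0;
}
```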
@@ -3043,6 +3156,14 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + if (VT.isVector()) { + // fold (mulhu x, 0) -> 0 + if (ISD::isBuildVectorAllZeros(N1.getNode())) + return N1; + if (ISD::isBuildVectorAllZeros(N0.getNode())) + return N0; + } + // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; @@ -3216,7 +3337,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; - // fold (add c1, c2) -> c1+c2 + // fold operation with constant operands. ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) @@ -3599,22 +3720,20 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { } bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, - EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, - bool &NarrowLoad) { - uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); - - if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits)) + EVT LoadResultTy, EVT &ExtVT) { + if (!AndC->getAPIntValue().isMask()) return false; + unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); + ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); - LoadedVT = LoadN->getMemoryVT(); + EVT LoadedVT = LoadN->getMemoryVT(); if (ExtVT == LoadedVT && (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { // ZEXTLOAD will match without needing to change the size of the value being // loaded. - NarrowLoad = false; return true; } @@ -3634,10 +3753,185 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT)) return false; - NarrowLoad = true; return true; } +bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType, + EVT &ExtVT, unsigned ShAmt) { + // Don't transform one with multiple uses, this would require adding a new + // load. + if (!SDValue(LoadN, 0).hasOneUse()) + return false; + + if (LegalOperations && + !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT)) + return false; + + // Do not generate loads of non-round integer types since these can + // be expensive (and would be wrong if the type is not byte sized). + if (!ExtVT.isRound()) + return false; + + // Don't change the width of a volatile load. + if (LoadN->isVolatile()) + return false; + + // Verify that we are actually reducing a load width here. + if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits()) + return false; + + // For the transform to be legal, the load must produce only two values + // (the value loaded and the chain). Don't transform a pre-increment + // load, for example, which produces an extra value. Otherwise the + // transformation is not equivalent, and the downstream logic to replace + // uses gets things wrong. + if (LoadN->getNumValues() > 2) + return false; + + // If the load that we're shrinking is an extload and we're not just + // discarding the extension we can't simply shrink the load. Bail. + // TODO: It would be possible to merge the extensions in some cases. + if (LoadN->getExtensionType() != ISD::NON_EXTLOAD && + LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt) + return false; + + if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT)) + return false; + + // It's not possible to generate a constant of extended or untyped type. 
+ EVT PtrType = LoadN->getOperand(1).getValueType(); + if (PtrType == MVT::Untyped || PtrType.isExtended()) + return false; + + return true; +} + +bool DAGCombiner::SearchForAndLoads(SDNode *N, + SmallPtrSetImpl<LoadSDNode*> &Loads, + SmallPtrSetImpl<SDNode*> &NodesWithConsts, + ConstantSDNode *Mask, + SDNode *&NodeToMask) { + // Recursively search for the operands, looking for loads which can be + // narrowed. + for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) { + SDValue Op = N->getOperand(i); + + if (Op.getValueType().isVector()) + return false; + + // Some constants may need fixing up later if they are too large. + if (auto *C = dyn_cast<ConstantSDNode>(Op)) { + if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) && + (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue()) + NodesWithConsts.insert(N); + continue; + } + + if (!Op.hasOneUse()) + return false; + + switch(Op.getOpcode()) { + case ISD::LOAD: { + auto *Load = cast<LoadSDNode>(Op); + EVT ExtVT; + if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) && + isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) { + // Only add this load if we can make it more narrow. + if (ExtVT.bitsLT(Load->getMemoryVT())) + Loads.insert(Load); + continue; + } + return false; + } + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::AssertZext: { + unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); + EVT VT = Op.getOpcode() == ISD::AssertZext ? + cast<VTSDNode>(Op.getOperand(1))->getVT() : + Op.getOperand(0).getValueType(); + + // We can accept extending nodes if the mask is wider or an equal + // width to the original type. + if (ExtVT.bitsGE(VT)) + continue; + break; + } + case ISD::OR: + case ISD::XOR: + case ISD::AND: + if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask, + NodeToMask)) + return false; + continue; + } + + // Allow one node which will masked along with any loads found. + if (NodeToMask) + return false; + NodeToMask = Op.getNode(); + } + return true; +} + +bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) { + auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Mask) + return false; + + if (!Mask->getAPIntValue().isMask()) + return false; + + // No need to do anything if the and directly uses a load. + if (isa<LoadSDNode>(N->getOperand(0))) + return false; + + SmallPtrSet<LoadSDNode*, 8> Loads; + SmallPtrSet<SDNode*, 2> NodesWithConsts; + SDNode *FixupNode = nullptr; + if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) { + if (Loads.size() == 0) + return false; + + SDValue MaskOp = N->getOperand(1); + + // If it exists, fixup the single node we allow in the tree that needs + // masking. + if (FixupNode) { + SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode), + FixupNode->getValueType(0), + SDValue(FixupNode, 0), MaskOp); + DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And); + DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), + MaskOp); + } + + // Narrow any constants that need it. + for (auto *LogicN : NodesWithConsts) { + auto *C = cast<ConstantSDNode>(LogicN->getOperand(1)); + SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0), + SDValue(C, 0), MaskOp); + DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And); + } + + // Create narrow loads. 
+ for (auto *Load : Loads) { + SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0), + SDValue(Load, 0), MaskOp); + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And); + DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp); + SDValue NewLoad = ReduceLoadWidth(And.getNode()); + assert(NewLoad && + "Shouldn't be masking the load if it can't be narrowed"); + CombineTo(Load, NewLoad, NewLoad.getValue(1)); + } + DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode()); + return true; + } + return false; +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -3829,55 +4123,23 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || (N0.getOpcode() == ISD::ANY_EXTEND && N0.getOperand(0).getOpcode() == ISD::LOAD))) { - bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND; - LoadSDNode *LN0 = HasAnyExt - ? cast<LoadSDNode>(N0.getOperand(0)) - : cast<LoadSDNode>(N0); - if (LN0->getExtensionType() != ISD::SEXTLOAD && - LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) { - auto NarrowLoad = false; - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; - EVT ExtVT, LoadedVT; - if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT, - NarrowLoad)) { - if (!NarrowLoad) { - SDValue NewLoad = - DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, - LN0->getChain(), LN0->getBasePtr(), ExtVT, - LN0->getMemOperand()); - AddToWorklist(N); - CombineTo(LN0, NewLoad, NewLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! - } else { - EVT PtrType = LN0->getOperand(1).getValueType(); - - unsigned Alignment = LN0->getAlignment(); - SDValue NewPtr = LN0->getBasePtr(); - - // For big endian targets, we need to add an offset to the pointer - // to load the correct bytes. For little endian systems, we merely - // need to read fewer bytes from the same pointer. - if (DAG.getDataLayout().isBigEndian()) { - unsigned LVTStoreBytes = LoadedVT.getStoreSize(); - unsigned EVTStoreBytes = ExtVT.getStoreSize(); - unsigned PtrOff = LVTStoreBytes - EVTStoreBytes; - SDLoc DL(LN0); - NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, - NewPtr, DAG.getConstant(PtrOff, DL, PtrType)); - Alignment = MinAlign(Alignment, PtrOff); - } + if (SDValue Res = ReduceLoadWidth(N)) { + LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND + ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0); - AddToWorklist(NewPtr.getNode()); + AddToWorklist(N); + CombineTo(LN0, Res, Res.getValue(1)); + return SDValue(N, 0); + } + } - SDValue Load = DAG.getExtLoad( - ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr, - LN0->getPointerInfo(), ExtVT, Alignment, - LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); - AddToWorklist(N); - CombineTo(LN0, Load, Load.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! - } - } + if (Level >= AfterLegalizeTypes) { + // Attempt to propagate the AND back up to the leaves which, if they're + // loads, can be combined to narrow loads and the AND node can be removed. + // Perform after legalization so that extend nodes will already be + // combined into the loads. 
+ if (BackwardsPropagateMask(N, DAG)) { + return SDValue(N, 0); } } @@ -3974,7 +4236,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); - // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00) + // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) bool LookPassAnd0 = false; bool LookPassAnd1 = false; if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) @@ -4593,20 +4855,6 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, return nullptr; } -// if Left + Right == Sum (constant or constant splat vector) -static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum, - SelectionDAG &DAG, const SDLoc &DL) { - EVT ShiftVT = Left.getValueType(); - if (ShiftVT != Right.getValueType()) return false; - - SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT, - Left.getNode(), Right.getNode()); - if (!ShiftSum) return false; - - ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum); - return CSum && CSum->getZExtValue() == Sum; -} - // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. @@ -4620,6 +4868,16 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT); if (!HasROTL && !HasROTR) return nullptr; + // Check for truncated rotate. + if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && + LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { + assert(LHS.getValueType() == RHS.getValueType()); + if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { + return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), + SDValue(Rot, 0)).getNode(); + } + } + // Match "(X shl/srl V1) & V2" where V2 may not be present. SDValue LHSShift; // The shift. SDValue LHSMask; // AND value if any. @@ -4652,7 +4910,11 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) { + auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; + }; + if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); @@ -4712,20 +4974,22 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { } namespace { + /// Represents known origin of an individual byte in load combine pattern. The /// value of the byte is either constant zero or comes from memory. struct ByteProvider { // For constant zero providers Load is set to nullptr. For memory providers // Load represents the node which loads the byte from memory. // ByteOffset is the offset of the byte in the value produced by the load. 
- LoadSDNode *Load; - unsigned ByteOffset; + LoadSDNode *Load = nullptr; + unsigned ByteOffset = 0; - ByteProvider() : Load(nullptr), ByteOffset(0) {} + ByteProvider() = default; static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { return ByteProvider(Load, ByteOffset); } + static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } bool isConstantZero() const { return !Load; } @@ -4740,6 +5004,8 @@ private: : Load(Load), ByteOffset(ByteOffset) {} }; +} // end anonymous namespace + /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns None if the provider can't be calculated. /// @@ -4751,9 +5017,9 @@ private: /// Because the parts of the expression are not allowed to have more than one /// use this function iterates over trees, not DAGs. So it never visits the same /// node more than once. -const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index, - unsigned Depth, - bool Root = false) { +static const Optional<ByteProvider> +calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, + bool Root = false) { // Typical i64 by i8 pattern requires recursion up to 8 calls depth if (Depth == 10) return None; @@ -4837,7 +5103,6 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index, return None; } -} // namespace /// Match a pattern where a wide type scalar value is loaded by several narrow /// loads and combined by shifts and ors. Fold it into a single load or a load @@ -4950,7 +5215,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { Loads.insert(L); } - assert(Loads.size() > 0 && "All the bytes of the value must be loaded from " + assert(!Loads.empty() && "All the bytes of the value must be loaded from " "memory, so there must be at least one load which produces the value"); assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); @@ -5373,7 +5638,11 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef - if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) + // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. 
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { + return Val->getAPIntValue().uge(OpSizeInBits); + }; + if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (shl x, 0) -> x if (N1C && N1C->isNullValue()) @@ -5400,20 +5669,29 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return SDValue(N, 0); // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) - if (N1C && N0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - SDLoc DL(N); - APInt c1 = N0C1->getAPIntValue(); - APInt c2 = N1C->getAPIntValue(); + if (N0.getOpcode() == ISD::SHL) { + auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).uge(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) + return DAG.getConstant(0, SDLoc(N), VT); - APInt Sum = c1 + c2; - if (Sum.uge(OpSizeInBits)) - return DAG.getConstant(0, DL, VT); - - return DAG.getNode( - ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); + auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).ult(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { + SDLoc DL(N); + EVT ShiftVT = N1.getValueType(); + SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); + return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum); } } @@ -5527,16 +5805,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) + // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. - if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && + if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && + N0.getNode()->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); AddToWorklist(Shl0.getNode()); AddToWorklist(Shl1.getNode()); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1); + return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); } // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) @@ -5579,7 +5859,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); // fold (sra x, c >= size(x)) -> undef - if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) + // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. 
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { + return Val->getAPIntValue().uge(OpSizeInBits); + }; + if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) @@ -5603,20 +5887,31 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { } // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) - if (N1C && N0.getOpcode() == ISD::SRA) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - SDLoc DL(N); - APInt c1 = N0C1->getAPIntValue(); - APInt c2 = N1C->getAPIntValue(); - zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); - - APInt Sum = c1 + c2; - if (Sum.uge(OpSizeInBits)) - Sum = APInt(OpSizeInBits, OpSizeInBits - 1); + if (N0.getOpcode() == ISD::SRA) { + SDLoc DL(N); + EVT ShiftVT = N1.getValueType(); - return DAG.getNode( - ISD::SRA, DL, VT, N0.getOperand(0), - DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); + auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).uge(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) + return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), + DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT)); + + auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).ult(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { + SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); + return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum); } } @@ -5647,7 +5942,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) && TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) && TLI.isTruncateFree(VT, TruncVT)) { - SDLoc DL(N); SDValue Amt = DAG.getConstant(ShiftAmt, DL, getShiftAmountTy(N0.getOperand(0).getValueType())); @@ -5697,7 +5991,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - // If the sign bit is known to be zero, switch this to a SRL. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); @@ -5730,7 +6023,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef - if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) + // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. 
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { + return Val->getAPIntValue().uge(OpSizeInBits); + }; + if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) @@ -5745,20 +6042,29 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return DAG.getConstant(0, SDLoc(N), VT); // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2)) - if (N1C && N0.getOpcode() == ISD::SRL) { - if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - SDLoc DL(N); - APInt c1 = N0C1->getAPIntValue(); - APInt c2 = N1C->getAPIntValue(); + if (N0.getOpcode() == ISD::SRL) { + auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).uge(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) + return DAG.getConstant(0, SDLoc(N), VT); - APInt Sum = c1 + c2; - if (Sum.uge(OpSizeInBits)) - return DAG.getConstant(0, DL, VT); - - return DAG.getNode( - ISD::SRL, DL, VT, N0.getOperand(0), - DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); + auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return (c1 + c2).ult(OpSizeInBits); + }; + if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { + SDLoc DL(N); + EVT ShiftVT = N1.getValueType(); + SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); + return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum); } } @@ -6008,7 +6314,6 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { return SDValue(); } - /// \brief Generate Min/Max node static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, @@ -6096,7 +6401,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { // For any constants that differ by 1, we can transform the select into an // extend and add. Use a target hook because some targets may prefer to // transform in the other direction. - if (TLI.convertSelectOfConstantsToMath()) { + if (TLI.convertSelectOfConstantsToMath(VT)) { if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { // select Cond, C1, C1-1 --> add (zext Cond), C1-1 if (VT != MVT::i1) @@ -6371,7 +6676,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { } SDValue DAGCombiner::visitMSCATTER(SDNode *N) { - if (Level >= AfterLegalizeTypes) return SDValue(); @@ -6432,7 +6736,6 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { } SDValue DAGCombiner::visitMSTORE(SDNode *N) { - if (Level >= AfterLegalizeTypes) return SDValue(); @@ -6447,7 +6750,6 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { - // Check if any splitting is required. 
if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) @@ -6504,11 +6806,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { } SDValue DAGCombiner::visitMGATHER(SDNode *N) { - if (Level >= AfterLegalizeTypes) return SDValue(); - MaskedGatherSDNode *MGT = dyn_cast<MaskedGatherSDNode>(N); + MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N); SDValue Mask = MGT->getMask(); SDLoc DL(N); @@ -6581,7 +6882,6 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { } SDValue DAGCombiner::visitMLOAD(SDNode *N) { - if (Level >= AfterLegalizeTypes) return SDValue(); @@ -6593,7 +6893,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). - if (Mask.getOpcode() == ISD::SETCC) { EVT VT = N->getValueType(0); @@ -6665,6 +6964,57 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { return SDValue(); } +/// A vector select of 2 constant vectors can be simplified to math/logic to +/// avoid a variable select instruction and possibly avoid constant loads. +SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { + SDValue Cond = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 || + !TLI.convertSelectOfConstantsToMath(VT) || + !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) || + !ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) + return SDValue(); + + // Check if we can use the condition value to increment/decrement a single + // constant value. This simplifies a select to an add and removes a constant + // load/materialization from the general case. + bool AllAddOne = true; + bool AllSubOne = true; + unsigned Elts = VT.getVectorNumElements(); + for (unsigned i = 0; i != Elts; ++i) { + SDValue N1Elt = N1.getOperand(i); + SDValue N2Elt = N2.getOperand(i); + if (N1Elt.isUndef() || N2Elt.isUndef()) + continue; + + const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue(); + const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue(); + if (C1 != C2 + 1) + AllAddOne = false; + if (C1 != C2 - 1) + AllSubOne = false; + } + + // Further simplifications for the extra-special cases where the constants are + // all 0 or all -1 should be implemented as folds of these patterns. + SDLoc DL(N); + if (AllAddOne || AllSubOne) { + // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C + // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C + auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond); + return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2); + } + + // The general case for select-of-constants: + // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2 + // ...but that only makes sense if a vselect is slower than 2 logic ops, so + // leave that to a machine-specific pass. 
+ return SDValue(); +} + SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -6729,6 +7079,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return CV; } + if (SDValue V = foldVSelectOfConstants(N)) + return V; + return SDValue(); } @@ -7243,8 +7596,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); + bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); + // If N0 has multiple uses, change other uses as well. + if (NoReplaceTruncAnd) { + SDValue TruncAnd = + DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); + CombineTo(N0.getNode(), TruncAnd); + } if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else @@ -7307,7 +7667,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; - if (!VT.isVector()) { + if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) { EVT SetCCVT = getSetCCResultType(N00VT); // Don't do this transform for i1 because there's a select transform // that would reverse it. @@ -7399,20 +7759,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } - // fold (zext (truncate (load x))) -> (zext (smaller load x)) - // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n))) - if (N0.getOpcode() == ISD::TRUNCATE) { - if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { - SDNode *oye = N0.getOperand(0).getNode(); - if (NarrowLoad.getNode() != N0.getNode()) { - CombineTo(N0.getNode(), NarrowLoad); - // CombineTo deleted the truncate, if needed, but not what's under it. - AddToWorklist(oye); - } - return SDValue(N, 0); // Return N so it doesn't get rechecked! - } - } - // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -7445,7 +7791,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); AddToWorklist(Op.getNode()); - return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); + SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); + // We may safely transfer the debug info describing the truncate node over + // to the equivalent and operation. + DAG.transferDbgValues(N0, And); + return And; } } @@ -7522,11 +7872,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (!N0.hasOneUse()) { if (N0.getOpcode() == ISD::AND) { auto *AndC = cast<ConstantSDNode>(N0.getOperand(1)); - auto NarrowLoad = false; EVT LoadResultTy = AndC->getValueType(0); - EVT ExtVT, LoadedVT; - if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT, - NarrowLoad)) + EVT ExtVT; + if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT)) DoXform = false; } if (DoXform) @@ -7547,8 +7895,15 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); + bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); + // If N0 has multiple uses, change other uses as well. 
+ if (NoReplaceTruncAnd) { + SDValue TruncAnd = + DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); + CombineTo(N0.getNode(), TruncAnd); + } if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else @@ -7604,10 +7959,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend. - EVT MatchingElementType = EVT::getIntegerVT( - *DAG.getContext(), N00VT.getScalarSizeInBits()); - EVT MatchingVectorType = EVT::getVectorVT( - *DAG.getContext(), MatchingElementType, N00VT.getVectorNumElements()); + EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1), N0.getOperand(2)); @@ -7731,7 +8083,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = N0.hasOneUse(); - CombineTo(N, ExtLoad); + CombineTo(N, ExtLoad); if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else @@ -7769,13 +8121,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // aext(setcc) -> aext(vsetcc) // Only do this before legalize for now. if (VT.isVector() && !LegalOperations) { - EVT N0VT = N0.getOperand(0).getValueType(); - // We know that the # elements of the results is the same as the - // # elements of the compare (and the # elements of the compare result - // for that matter). Check to see that they are the same size. If so, - // we know that the element size of the sext'd result matches the - // element size of the compare operands. - if (VT.getSizeInBits() == N0VT.getSizeInBits()) + EVT N00VT = N0.getOperand(0).getValueType(); + if (getSetCCResultType(N00VT) == N0.getValueType()) + return SDValue(); + + // We know that the # elements of the results is the same as the + // # elements of the compare (and the # elements of the compare result + // for that matter). Check to see that they are the same size. If so, + // we know that the element size of the sext'd result matches the + // element size of the compare operands. 
+ if (VT.getSizeInBits() == N00VT.getSizeInBits()) return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast<CondCodeSDNode>(N0.getOperand(2))->get()); @@ -7783,7 +8138,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // elements we can use a matching integer vector type and then // truncate/any extend else { - EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); + EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), @@ -7804,77 +8159,47 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitAssertZext(SDNode *N) { +SDValue DAGCombiner::visitAssertExt(SDNode *N) { + unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - EVT EVT = cast<VTSDNode>(N1)->getVT(); + EVT AssertVT = cast<VTSDNode>(N1)->getVT(); - // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt) - if (N0.getOpcode() == ISD::AssertZext && - EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) + // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt) + if (N0.getOpcode() == Opcode && + AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) return N0; - return SDValue(); -} + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + N0.getOperand(0).getOpcode() == Opcode) { + // We have an assert, truncate, assert sandwich. Make one stronger assert + // by asserting on the smallest asserted type to the larger source type. + // This eliminates the later assert: + // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN + // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN + SDValue BigA = N0.getOperand(0); + EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT(); + assert(BigA_AssertVT.bitsLE(N0.getValueType()) && + "Asserting zero/sign-extended bits to a type larger than the " + "truncated destination does not provide information"); -/// See if the specified operand can be simplified with the knowledge that only -/// the bits specified by Mask are used. If so, return the simpler operand, -/// otherwise return a null SDValue. -/// -/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can -/// simplify nodes with multiple uses more aggressively.) -SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { - switch (V.getOpcode()) { - default: break; - case ISD::Constant: { - const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode()); - assert(CV && "Const value should be ConstSDNode."); - const APInt &CVal = CV->getAPIntValue(); - APInt NewVal = CVal & Mask; - if (NewVal != CVal) - return DAG.getConstant(NewVal, SDLoc(V), V.getValueType()); - break; + SDLoc DL(N); + EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT; + SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT); + SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), + BigA.getOperand(0), MinAssertVTVal); + return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); } - case ISD::OR: - case ISD::XOR: - // If the LHS or RHS don't contribute bits to the or, drop them. - if (DAG.MaskedValueIsZero(V.getOperand(0), Mask)) - return V.getOperand(1); - if (DAG.MaskedValueIsZero(V.getOperand(1), Mask)) - return V.getOperand(0); - break; - case ISD::SRL: - // Only look at single-use SRLs. 
-    if (!V.getNode()->hasOneUse())
-      break;
-    if (ConstantSDNode *RHSC = getAsNonOpaqueConstant(V.getOperand(1))) {
-      // See if we can recursively simplify the LHS.
-      unsigned Amt = RHSC->getZExtValue();
-      // Watch out for shift count overflow though.
-      if (Amt >= Mask.getBitWidth()) break;
-      APInt NewMask = Mask << Amt;
-      if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
-        return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
-                           SimplifyLHS, V.getOperand(1));
-    }
-    break;
-  case ISD::AND: {
-    // X & -1 -> X (ignoring bits which aren't demanded).
-    ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
-    if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
-      return V.getOperand(0);
-    break;
-  }
-  }
   return SDValue();
 }
 
 /// If the result of a wider load is shifted to the right by N bits and then
 /// truncated to a narrower type and where N is a multiple of number of bits of
 /// the narrower type, transform it to a narrower load from address + N / num of
-/// bits of new type. If the result is to be extended, also fold the extension
-/// to form an extending load.
+/// bits of new type. Also narrow the load if the result is masked with an AND
+/// to effectively produce a smaller type. If the result is to be extended, also
+/// fold the extension to form an extending load.
 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
   unsigned Opc = N->getOpcode();
 
@@ -7893,28 +8218,40 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     ExtType = ISD::SEXTLOAD;
     ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
   } else if (Opc == ISD::SRL) {
-    // Another special-case: SRL is basically zero-extending a narrower value.
+    // Another special-case: SRL is basically zero-extending a narrower value,
+    // or it may be shifting a higher subword, half or byte into the lowest
+    // bits.
     ExtType = ISD::ZEXTLOAD;
     N0 = SDValue(N, 0);
-    ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
-    if (!N01) return SDValue();
-    ExtVT = EVT::getIntegerVT(*DAG.getContext(),
-                              VT.getSizeInBits() - N01->getZExtValue());
-  }
-  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
-    return SDValue();
 
-  unsigned EVTBits = ExtVT.getSizeInBits();
+    auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
+    auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (!N01 || !LN0)
+      return SDValue();
 
-  // Do not generate loads of non-round integer types since these can
-  // be expensive (and would be wrong if the type is not byte sized).
-  if (!ExtVT.isRound())
-    return SDValue();
+    uint64_t ShiftAmt = N01->getZExtValue();
+    uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
+    if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
+    else
+      ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+                                VT.getSizeInBits() - ShiftAmt);
+  } else if (Opc == ISD::AND) {
+    // An AND with a constant mask is the same as a truncate + zero-extend.
+    auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!AndC || !AndC->getAPIntValue().isMask())
+      return SDValue();
+
+    unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+    ExtType = ISD::ZEXTLOAD;
+    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+  }
 
   unsigned ShAmt = 0;
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       ShAmt = N01->getZExtValue();
+      unsigned EVTBits = ExtVT.getSizeInBits();
       // Is the shift amount a multiple of size of VT?
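// Why the SRL case above can narrow the load at all: shifting right by a
// multiple of the narrow width selects a whole subword of the wider value,
// so the same bits can be produced by a smaller load at an adjusted address.
// A host-level sketch of the equivalence (little-endian assumed; big-endian
// needs the offset mirrored, as the code further below does):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Wide = 0xAABBCCDD;
  unsigned char Mem[4];
  std::memcpy(Mem, &Wide, 4);                    // memory behind the wide load
  for (unsigned ShAmt = 0; ShAmt < 32; ShAmt += 8) {
    uint8_t ViaShift = uint8_t(Wide >> ShAmt);   // (trunc (srl (load p), ShAmt))
    uint8_t ViaNarrowLoad;
    std::memcpy(&ViaNarrowLoad, Mem + ShAmt / 8, 1);  // load i8 at p + ShAmt/8
    assert(ViaShift == ViaNarrowLoad);
  }
  return 0;
}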
if ((ShAmt & (EVTBits-1)) == 0) { N0 = N0.getOperand(0); @@ -7951,42 +8288,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { } } - // If we haven't found a load, we can't narrow it. Don't transform one with - // multiple uses, this would require adding a new load. - if (!isa<LoadSDNode>(N0) || !N0.hasOneUse()) + // If we haven't found a load, we can't narrow it. + if (!isa<LoadSDNode>(N0)) return SDValue(); - // Don't change the width of a volatile load. LoadSDNode *LN0 = cast<LoadSDNode>(N0); - if (LN0->isVolatile()) - return SDValue(); - - // Verify that we are actually reducing a load width here. - if (LN0->getMemoryVT().getSizeInBits() < EVTBits) - return SDValue(); - - // For the transform to be legal, the load must produce only two values - // (the value loaded and the chain). Don't transform a pre-increment - // load, for example, which produces an extra value. Otherwise the - // transformation is not equivalent, and the downstream logic to replace - // uses gets things wrong. - if (LN0->getNumValues() > 2) - return SDValue(); - - // If the load that we're shrinking is an extload and we're not just - // discarding the extension we can't simply shrink the load. Bail. - // TODO: It would be possible to merge the extensions in some cases. - if (LN0->getExtensionType() != ISD::NON_EXTLOAD && - LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt) - return SDValue(); - - if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT)) - return SDValue(); - - EVT PtrType = N0.getOperand(1).getValueType(); - - if (PtrType == MVT::Untyped || PtrType.isExtended()) - // It's not possible to generate a constant of extended or untyped type. + if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt)) return SDValue(); // For big endian targets, we need to adjust the offset to the pointer to @@ -7997,6 +8304,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ShAmt = LVTStoreBits - EVTStoreBits - ShAmt; } + EVT PtrType = N0.getOperand(1).getValueType(); uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); @@ -8130,10 +8438,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } // fold (sext_inreg (extload x)) -> (sextload x) + // If sextload is not supported by target, we can only do the combine when + // load has one use. Doing otherwise can block folding the extload with other + // extends that the target does support. 
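// The (sext_inreg (extload x)) -> (sextload x) fold above is the DAG form of
// a familiar scalar identity: loading a byte with unspecified or zero upper
// bits and then sign-extending "in register" gives the same value as a signed
// byte load. A standalone sketch (assumes two's complement and arithmetic
// right shift on signed types, which C++20 guarantees):

#include <cassert>
#include <cstdint>

// Model sext_inreg: sign-extend the low Bits of V out to the full 32 bits.
static int32_t signExtendInReg(uint32_t V, unsigned Bits) {
  unsigned Shift = 32 - Bits;
  return int32_t(V << Shift) >> Shift;
}

int main() {
  for (int i = -128; i != 128; ++i) {
    uint8_t InMemory = uint8_t(i);
    int32_t ViaExtLoadThenSextInReg = signExtendInReg(InMemory, 8);
    int32_t ViaSextLoad = int8_t(InMemory);
    assert(ViaExtLoadThenSextInReg == ViaSextLoad);
  }
  return 0;
}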
if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast<LoadSDNode>(N0)->getMemoryVT() && - ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || + ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() && + N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, @@ -8208,12 +8520,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // noop truncate if (N0.getValueType() == N->getValueType(0)) return N0; - // fold (truncate c1) -> c1 - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); + // fold (truncate (truncate x)) -> (truncate x) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); + + // fold (truncate c1) -> c1 + if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { + SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); + if (C.getNode() != N) + return C; + } + // fold (truncate (ext x)) -> (ext x) or (truncate x) or x if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || @@ -8245,7 +8563,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // we need to be more careful about the vector instructions that we generate. if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { - EVT VecTy = N0.getOperand(0).getValueType(); EVT ExTy = N0.getValueType(); EVT TrTy = N->getValueType(0); @@ -8311,7 +8628,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && N0.getOperand(0).hasOneUse()) { - SDValue BuildVect = N0.getOperand(0); EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); EVT TruncVecEltTy = VT.getVectorElementType(); @@ -8340,9 +8656,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Currently we only perform this optimization on scalars because vectors // may have different active low bits. if (!VT.isVector()) { - if (SDValue Shorter = - GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(), - VT.getSizeInBits()))) + APInt Mask = + APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); + if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } @@ -8413,7 +8729,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Fold truncate of a bitcast of a vector to an extract of the low vector // element. // - // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0 + // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue VecSrc = N0.getOperand(0); EVT SrcVT = VecSrc.getValueType(); @@ -8423,8 +8739,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDLoc SL(N); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, - VecSrc, DAG.getConstant(0, SL, IdxVT)); + VecSrc, DAG.getConstant(Idx, SL, IdxVT)); } } @@ -8466,11 +8783,18 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0)); LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1)); + + // A BUILD_PAIR is always having the least significant part in elt 0 and the + // most significant part in elt 1. 
So when combining into one large load, we + // need to consider the endianness. + if (DAG.getDataLayout().isBigEndian()) + std::swap(LD1, LD2); + if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() || LD1->getAddressSpace() != LD2->getAddressSpace()) return SDValue(); EVT LD1VT = LD1->getValueType(0); - unsigned LD1Bytes = LD1VT.getSizeInBits() / 8; + unsigned LD1Bytes = LD1VT.getStoreSize(); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) { unsigned Align = LD1->getAlignment(); @@ -8751,12 +9075,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || + if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) return DAG.getBitcast(VT, Op); return SDValue(); }; + // FIXME: If either input vector is bitcast, try to convert the shuffle to + // the result type of this bitcast. This would eliminate at least one + // bitcast. See the transform in InstCombine. SDValue SV0 = PeekThroughBitcast(N0->getOperand(0)); SDValue SV1 = PeekThroughBitcast(N0->getOperand(1)); if (!(SV0 && SV1)) @@ -8949,7 +9276,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); - bool LookThroughFPExt = TLI.isFPExtFree(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. @@ -8979,28 +9305,31 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (LookThroughFPExt) { - // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (isContractableFMUL(N00)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), N1); + + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (isContractableFMUL(N00) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), N1); } + } - // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) - // Note: Commutes FADD operands. - if (N1.getOpcode() == ISD::FP_EXTEND) { - SDValue N10 = N1.getOperand(0); - if (isContractableFMUL(N10)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), N0); + // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) + // Note: Commutes FADD operands. 
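// These fpext folds move the add into a fused multiply-add computed in the
// wider type: fadd (fpext (fmul x, y)), z becomes fma (fpext x), (fpext y), z,
// so the narrow product's intermediate rounding disappears. That can change
// the last ulp of the result, which is why each case is gated on
// isContractableFMUL. A host-level sketch of the two forms:

#include <cmath>
#include <cstdio>

int main() {
  float x = 1.1f, y = -2.3f;
  double z = 4.25;
  double Unfused = double(x * y) + z;                // fadd (fpext (fmul x, y)), z
  double Fused = std::fma(double(x), double(y), z);  // fma (fpext x), (fpext y), z
  // Unfused rounds x*y to float first; Fused uses the exact double product,
  // so the two may differ slightly. Contraction licenses that difference.
  std::printf("unfused = %.17g\nfused   = %.17g\n", Unfused, Fused);
  return 0;
}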
+ if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (isContractableFMUL(N10) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), N0); } } @@ -9036,80 +9365,87 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (LookThroughFPExt) { - // fold (fadd (fma x, y, (fpext (fmul u, v))), z) - // -> (fma x, y, (fma (fpext u), (fpext v), z)) - auto FoldFAddFMAFPExtFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); - }; - if (N0.getOpcode() == PreferredFusedOpcode) { - SDValue N02 = N0.getOperand(2); - if (N02.getOpcode() == ISD::FP_EXTEND) { - SDValue N020 = N02.getOperand(0); - if (isContractableFMUL(N020)) - return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), - N020.getOperand(0), N020.getOperand(1), - N1); + + // fold (fadd (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + auto FoldFAddFMAFPExtFMul = [&] ( + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z)); + }; + if (N0.getOpcode() == PreferredFusedOpcode) { + SDValue N02 = N0.getOperand(2); + if (N02.getOpcode() == ISD::FP_EXTEND) { + SDValue N020 = N02.getOperand(0); + if (isContractableFMUL(N020) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { + return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), + N020.getOperand(0), N020.getOperand(1), + N1); } } + } - // fold (fadd (fpext (fma x, y, (fmul u, v))), z) - // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) - // FIXME: This turns two single-precision and one double-precision - // operation into two double-precision operations, which might not be - // interesting for all targets, especially GPUs. - auto FoldFAddFPExtFMAFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, X), - DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); - }; - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == PreferredFusedOpcode) { - SDValue N002 = N00.getOperand(2); - if (isContractableFMUL(N002)) - return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), - N002.getOperand(0), N002.getOperand(1), - N1); + // fold (fadd (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. 
+ auto FoldFAddFPExtFMAFMul = [&] ( + SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, X), + DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), + Z)); + }; + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == PreferredFusedOpcode) { + SDValue N002 = N00.getOperand(2); + if (isContractableFMUL(N002) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), + N002.getOperand(0), N002.getOperand(1), + N1); } } + } - // fold (fadd x, (fma y, z, (fpext (fmul u, v))) - // -> (fma y, z, (fma (fpext u), (fpext v), x)) - if (N1.getOpcode() == PreferredFusedOpcode) { - SDValue N12 = N1.getOperand(2); - if (N12.getOpcode() == ISD::FP_EXTEND) { - SDValue N120 = N12.getOperand(0); - if (isContractableFMUL(N120)) - return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), - N120.getOperand(0), N120.getOperand(1), - N0); + // fold (fadd x, (fma y, z, (fpext (fmul u, v))) + // -> (fma y, z, (fma (fpext u), (fpext v), x)) + if (N1.getOpcode() == PreferredFusedOpcode) { + SDValue N12 = N1.getOperand(2); + if (N12.getOpcode() == ISD::FP_EXTEND) { + SDValue N120 = N12.getOperand(0); + if (isContractableFMUL(N120) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { + return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), + N120.getOperand(0), N120.getOperand(1), + N0); } } + } - // fold (fadd x, (fpext (fma y, z, (fmul u, v))) - // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) - // FIXME: This turns two single-precision and one double-precision - // operation into two double-precision operations, which might not be - // interesting for all targets, especially GPUs. - if (N1.getOpcode() == ISD::FP_EXTEND) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == PreferredFusedOpcode) { - SDValue N102 = N10.getOperand(2); - if (isContractableFMUL(N102)) - return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), - N102.getOperand(0), N102.getOperand(1), - N0); + // fold (fadd x, (fpext (fma y, z, (fmul u, v))) + // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == PreferredFusedOpcode) { + SDValue N102 = N10.getOperand(2); + if (isContractableFMUL(N102) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), + N102.getOperand(0), N102.getOperand(1), + N0); } } } @@ -9151,7 +9487,6 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); - bool LookThroughFPExt = TLI.isFPExtFree(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. @@ -9187,79 +9522,83 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. 
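// The fsub forms below mirror the fadd ones because IEEE negation is exact:
// flipping a sign bit never rounds, so every fadd contraction has an fsub
// counterpart with a negated operand folded into the FMA. A small sketch of
// the sign identities these rewrites lean on:

#include <cassert>
#include <cmath>

int main() {
  double x = 3.5, y = -1.25, z = 0.875;
  assert(x - z == x + (-z));       // fsub x, z        == fadd x, (fneg z)
  assert(-(x * y) == (-x) * y);    // fneg (fmul x, y) == fmul (fneg x), y
  // So fsub (fmul x, y), z contracts to fma x, y, (fneg z); with these exact
  // inputs the unfused form agrees, since 3.5 * -1.25 is representable.
  assert(std::fma(x, y, -z) == x * y - z);
  return 0;
}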
- if (LookThroughFPExt) { - // fold (fsub (fpext (fmul x, y)), z) - // -> (fma (fpext x), (fpext y), (fneg z)) - if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (isContractableFMUL(N00)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + + // fold (fsub (fpext (fmul x, y)), z) + // -> (fma (fpext x), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (isContractableFMUL(N00) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1)); } + } - // fold (fsub x, (fpext (fmul y, z))) - // -> (fma (fneg (fpext y)), (fpext z), x) - // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FP_EXTEND) { - SDValue N10 = N1.getOperand(0); - if (isContractableFMUL(N10)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, + // fold (fsub x, (fpext (fmul y, z))) + // -> (fma (fneg (fpext y)), (fpext z), x) + // Note: Commutes FSUB operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (isContractableFMUL(N10) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N10.getOperand(1)), + N0); + } + } + + // fold (fsub (fpext (fneg (fmul, x, y))), z) + // -> (fneg (fma (fpext x), (fpext y), z)) + // Note: This could be removed with appropriate canonicalization of the + // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the + // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // from implementing the canonicalization in visitFSUB. + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FNEG) { + SDValue N000 = N00.getOperand(0); + if (isContractableFMUL(N000) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + return DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), - N0); - } - - // fold (fsub (fpext (fneg (fmul, x, y))), z) - // -> (fneg (fma (fpext x), (fpext y), z)) - // Note: This could be removed with appropriate canonicalization of the - // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the - // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent - // from implementing the canonicalization in visitFSUB. 
- if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FNEG) { - SDValue N000 = N00.getOperand(0); - if (isContractableFMUL(N000)) { - return DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(1)), - N1)); - } + N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(1)), + N1)); } } + } - // fold (fsub (fneg (fpext (fmul, x, y))), z) - // -> (fneg (fma (fpext x)), (fpext y), z) - // Note: This could be removed with appropriate canonicalization of the - // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the - // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent - // from implementing the canonicalization in visitFSUB. - if (N0.getOpcode() == ISD::FNEG) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FP_EXTEND) { - SDValue N000 = N00.getOperand(0); - if (isContractableFMUL(N000)) { - return DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(1)), - N1)); - } + // fold (fsub (fneg (fpext (fmul, x, y))), z) + // -> (fneg (fma (fpext x)), (fpext y), z) + // Note: This could be removed with appropriate canonicalization of the + // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the + // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent + // from implementing the canonicalization in visitFSUB. + if (N0.getOpcode() == ISD::FNEG) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FP_EXTEND) { + SDValue N000 = N00.getOperand(0); + if (isContractableFMUL(N000) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) { + return DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N000.getOperand(1)), + N1)); } } - } // More folding opportunities when target permits. @@ -9298,102 +9637,108 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (LookThroughFPExt) { - // fold (fsub (fma x, y, (fpext (fmul u, v))), z) - // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) - if (N0.getOpcode() == PreferredFusedOpcode) { - SDValue N02 = N0.getOperand(2); - if (N02.getOpcode() == ISD::FP_EXTEND) { - SDValue N020 = N02.getOperand(0); - if (isContractableFMUL(N020)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1))); - } - } - // fold (fsub (fpext (fma x, y, (fmul u, v))), z) - // -> (fma (fpext x), (fpext y), - // (fma (fpext u), (fpext v), (fneg z))) - // FIXME: This turns two single-precision and one double-precision - // operation into two double-precision operations, which might not be - // interesting for all targets, especially GPUs. 
- if (N0.getOpcode() == ISD::FP_EXTEND) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == PreferredFusedOpcode) { - SDValue N002 = N00.getOperand(2); - if (isContractableFMUL(N002)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1))); - } - } - - // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) - // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) - if (N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { - SDValue N120 = N1.getOperand(2).getOperand(0); - if (isContractableFMUL(N120)) { - SDValue N1200 = N120.getOperand(0); - SDValue N1201 = N120.getOperand(1); + // fold (fsub (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) + if (N0.getOpcode() == PreferredFusedOpcode) { + SDValue N02 = N0.getOperand(2); + if (N02.getOpcode() == ISD::FP_EXTEND) { + SDValue N020 = N02.getOperand(0); + if (isContractableFMUL(N020) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), - N1.getOperand(1), + N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1200)), DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1201), - N0)); + N020.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N020.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1))); } } + } - // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) - // -> (fma (fneg (fpext y)), (fpext z), - // (fma (fneg (fpext u)), (fpext v), x)) - // FIXME: This turns two single-precision and one double-precision - // operation into two double-precision operations, which might not be - // interesting for all targets, especially GPUs. - if (N1.getOpcode() == ISD::FP_EXTEND && - N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { - SDValue N100 = N1.getOperand(0).getOperand(0); - SDValue N101 = N1.getOperand(0).getOperand(1); - SDValue N102 = N1.getOperand(0).getOperand(2); - if (isContractableFMUL(N102)) { - SDValue N1020 = N102.getOperand(0); - SDValue N1021 = N102.getOperand(1); + // fold (fsub (fpext (fma x, y, (fmul u, v))), z) + // -> (fma (fpext x), (fpext y), + // (fma (fpext u), (fpext v), (fneg z))) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. 
+ if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == PreferredFusedOpcode) { + SDValue N002 = N00.getOperand(2); + if (isContractableFMUL(N002) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N100)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N00.getOperand(1)), DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1020)), DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1021), - N0)); + N002.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N002.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, + N1))); } } } + + // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) + // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) + if (N1.getOpcode() == PreferredFusedOpcode && + N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { + SDValue N120 = N1.getOperand(2).getOperand(0); + if (isContractableFMUL(N120) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { + SDValue N1200 = N120.getOperand(0); + SDValue N1201 = N120.getOperand(1); + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), + N1.getOperand(1), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1200)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N1201), + N0)); + } + } + + // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) + // -> (fma (fneg (fpext y)), (fpext z), + // (fma (fneg (fpext u)), (fpext v), x)) + // FIXME: This turns two single-precision and one double-precision + // operation into two double-precision operations, which might not be + // interesting for all targets, especially GPUs. + if (N1.getOpcode() == ISD::FP_EXTEND && + N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { + SDValue CvtSrc = N1.getOperand(0); + SDValue N100 = CvtSrc.getOperand(0); + SDValue N101 = CvtSrc.getOperand(1); + SDValue N102 = CvtSrc.getOperand(2); + if (isContractableFMUL(N102) && + TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) { + SDValue N1020 = N102.getOperand(0); + SDValue N1021 = N102.getOperand(1); + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N100)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, + VT, N1020)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, + N1021), + N0)); + } + } } return SDValue(); @@ -9959,6 +10304,14 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } + + // fma (fneg x), K, y -> fma x -K, y + if (N0.getOpcode() == ISD::FNEG && + (TLI.isOperationLegal(ISD::ConstantFP, VT) || + (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) { + return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2); + } } if (Options.UnsafeFPMath) { @@ -10081,8 +10434,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { (!LegalOperations || // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM // backend)... we should handle this gracefully after Legalize. 
- // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) || - TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || + // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) || + TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getConstantFP(Recip, DL, VT), Flags); @@ -10264,7 +10617,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || - TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) + TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and SINT_TO_FP is not legal on this target, @@ -10282,7 +10635,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && (!LegalOperations || - TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), @@ -10296,7 +10649,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && (!LegalOperations || - TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), @@ -10318,7 +10671,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || - TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) + TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and UINT_TO_FP is not legal on this target, @@ -10333,10 +10686,9 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) - if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && (!LegalOperations || - TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), @@ -10557,6 +10909,19 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); + // fold ftrunc (known rounded int x) -> x + // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is + // likely to be generated to extract integer from a rounded floating value. + switch (N0.getOpcode()) { + default: break; + case ISD::FRINT: + case ISD::FTRUNC: + case ISD::FNEARBYINT: + case ISD::FFLOOR: + case ISD::FCEIL: + return N0; + } + return SDValue(); } @@ -11160,6 +11525,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 
1 : 0)); deleteAndRecombine(Ptr.getNode()); + AddToWorklist(Result.getNode()); return true; } @@ -11445,6 +11811,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } namespace { + /// \brief Helper structure used to slice a load in smaller loads. /// Basically a slice is obtained from the following sequence: /// Origin = load Ty1, Base @@ -11462,21 +11829,19 @@ struct LoadedSlice { struct Cost { /// Are we optimizing for code size. bool ForCodeSize; + /// Various cost. - unsigned Loads; - unsigned Truncates; - unsigned CrossRegisterBanksCopies; - unsigned ZExts; - unsigned Shift; + unsigned Loads = 0; + unsigned Truncates = 0; + unsigned CrossRegisterBanksCopies = 0; + unsigned ZExts = 0; + unsigned Shift = 0; - Cost(bool ForCodeSize = false) - : ForCodeSize(ForCodeSize), Loads(0), Truncates(0), - CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {} + Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {} /// \brief Get the cost of one isolated slice. Cost(const LoadedSlice &LS, bool ForCodeSize = false) - : ForCodeSize(ForCodeSize), Loads(1), Truncates(0), - CrossRegisterBanksCopies(0), ZExts(0), Shift(0) { + : ForCodeSize(ForCodeSize), Loads(1) { EVT TruncType = LS.Inst->getValueType(0); EVT LoadedType = LS.getLoadedType(); if (TruncType != LoadedType && @@ -11538,13 +11903,17 @@ struct LoadedSlice { bool operator>=(const Cost &RHS) const { return !(*this < RHS); } }; + // The last instruction that represent the slice. This should be a // truncate instruction. SDNode *Inst; + // The original load instruction. LoadSDNode *Origin; + // The right shift amount in bits from the original load. unsigned Shift; + // The DAG from which Origin came from. // This is used to get some contextual information about legal types, etc. SelectionDAG *DAG; @@ -11746,7 +12115,8 @@ struct LoadedSlice { return true; } }; -} + +} // end anonymous namespace /// \brief Check that all bits set in \p UsedBits form a dense region, i.e., /// \p UsedBits looks like 0..0 1..1 0..0. @@ -11804,7 +12174,6 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices, for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, // Set the beginning of the pair. First = Second) { - Second = &LoadedSlices[CurrSlice]; // If First is NULL, it means we start a new pair. @@ -11935,7 +12304,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { // will be across several bytes. We do not support that. unsigned Width = User->getValueSizeInBits(0); if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) - return 0; + return false; // Build the slice for this chain of computations. LoadedSlice LS(User, LD, Shift, &DAG); @@ -12060,7 +12429,6 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { return Result; } - /// Check to see if IVal is something that provides a value as specified by /// MaskInfo. If so, replace the specified store with a narrower store of /// truncated IVal. @@ -12121,7 +12489,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, .getNode(); } - /// Look for sequence of load / op / store where op is one of 'or', 'xor', and /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try /// narrowing the load and store if it would end up being a win for performance @@ -12325,7 +12692,6 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, // Walk all the users of the constant with which we're multiplying. for (SDNode *Use : ConstNode->uses()) { - if (Use == MulNode) // This use is the one we're on right now. Skip it. 
continue; @@ -12376,6 +12742,12 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, return false; } +static SDValue peekThroughBitcast(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + return V; +} + SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { SmallVector<SDValue, 8> Chains; @@ -12403,56 +12775,93 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( if (NumStores < 2) return false; - int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; - // The latest Node in the DAG. SDLoc DL(StoreNodes[0].MemNode); - SDValue StoredVal; + int64_t ElementSizeBits = MemVT.getStoreSizeInBits(); + unsigned SizeInBits = NumStores * ElementSizeBits; + unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; + + EVT StoreTy; if (UseVector) { - bool IsVec = MemVT.isVector(); - unsigned Elts = NumStores; - if (IsVec) { - // When merging vector stores, get the total number of elements. - Elts *= MemVT.getVectorNumElements(); - } + unsigned Elts = NumStores * NumMemElts; // Get the type for the merged vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); - assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); + } else + StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + SDValue StoredVal; + if (UseVector) { if (IsConstantSrc) { SmallVector<SDValue, 8> BuildVector; - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + for (unsigned I = 0; I != NumStores; ++I) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode); SDValue Val = St->getValue(); - if (MemVT.getScalarType().isInteger()) - if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue())) - Val = DAG.getConstant( - (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(), - SDLoc(CFP), MemVT); + // If constant is of the wrong type, convert it now. + if (MemVT != Val.getValueType()) { + Val = peekThroughBitcast(Val); + // Deal with constants of wrong size. + if (ElementSizeBits != Val.getValueSizeInBits()) { + EVT IntMemVT = + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + if (isa<ConstantFPSDNode>(Val)) { + // Not clear how to truncate FP values. + return false; + } else if (auto *C = dyn_cast<ConstantSDNode>(Val)) + Val = DAG.getConstant(C->getAPIntValue() + .zextOrTrunc(Val.getValueSizeInBits()) + .zextOrTrunc(ElementSizeBits), + SDLoc(C), IntMemVT); + } + // Make sure correctly size type is the correct type. + Val = DAG.getBitcast(MemVT, Val); + } BuildVector.push_back(Val); } - StoredVal = DAG.getBuildVector(Ty, DL, BuildVector); + StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS + : ISD::BUILD_VECTOR, + DL, StoreTy, BuildVector); } else { SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < NumStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue Val = St->getValue(); - // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. - if (Val.getValueType() != MemVT) - return false; + SDValue Val = peekThroughBitcast(St->getValue()); + // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of + // type MemVT. If the underlying value is not the correct + // type, but it is an extraction of an appropriate vector we + // can recast Val to be of the correct type. This may require + // converting between EXTRACT_VECTOR_ELT and + // EXTRACT_SUBVECTOR. 
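// Merging constant stores, whether into a build_vector as above or into one
// wide integer constant as in the scalar path below, amounts to concatenating
// the narrow constants. A host-level sketch of four i8 stores becoming a
// single i32 store (little-endian assumed; big-endian reverses the order):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char Separate[4], Merged[4];
  // Four consecutive byte stores.
  Separate[0] = 0xDD; Separate[1] = 0xCC; Separate[2] = 0xBB; Separate[3] = 0xAA;
  // One merged store: shift each element into StoreInt, highest address first,
  // so the first store ends up in the low bits of the little-endian word.
  uint32_t StoreInt = 0;
  for (int i = 3; i >= 0; --i) {
    StoreInt <<= 8;
    StoreInt |= Separate[i];
  }
  std::memcpy(Merged, &StoreInt, 4);
  assert(std::memcmp(Separate, Merged, 4) == 0);
  return 0;
}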
+ if ((MemVT != Val.getValueType()) && + (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) { + SDValue Vec = Val.getOperand(0); + EVT MemVTScalarTy = MemVT.getScalarType(); + // We may need to add a bitcast here to get types to line up. + if (MemVTScalarTy != Vec.getValueType()) { + unsigned Elts = Vec.getValueType().getSizeInBits() / + MemVTScalarTy.getSizeInBits(); + EVT NewVecTy = + EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts); + Vec = DAG.getBitcast(NewVecTy, Vec); + } + auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT; + Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1)); + } Ops.push_back(Val); } // Build the extracted vector elements back into a vector. - StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, - DL, Ty, Ops); } + StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS + : ISD::BUILD_VECTOR, + DL, StoreTy, Ops); + } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); - unsigned SizeInBits = NumStores * ElementSizeBytes * 8; APInt StoreInt(SizeInBits, 0); // Construct a single integer constant which is made of the smaller @@ -12463,18 +12872,25 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); SDValue Val = St->getValue(); - StoreInt <<= ElementSizeBytes * 8; + StoreInt <<= ElementSizeBits; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { - StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); + StoreInt |= C->getAPIntValue() + .zextOrTrunc(ElementSizeBits) + .zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { - StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); + StoreInt |= C->getValueAPF() + .bitcastToAPInt() + .zextOrTrunc(ElementSizeBits) + .zextOrTrunc(SizeInBits); + // If fp truncation is necessary give up for now. + if (MemVT.getSizeInBits() != ElementSizeBits) + return false; } else { llvm_unreachable("Invalid constant element type"); } } // Create the new Load and Store operations. - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } @@ -12483,7 +12899,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( // make sure we use trunc store if it's necessary to be legal. SDValue NewStore; - if (UseVector || !UseTrunc) { + if (!UseTrunc) { NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); @@ -12517,6 +12933,7 @@ void DAGCombiner::getStoreMergeCandidates( BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); EVT MemVT = St->getMemoryVT(); + SDValue Val = peekThroughBitcast(St->getValue()); // We must have a base and an offset. 
   if (!BasePtr.getBase().getNode())
     return;
 
   // Do not handle stores to undef base pointers.
   if (BasePtr.getBase().isUndef())
     return;
 
-  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
-                       isa<ConstantFPSDNode>(St->getValue());
-  bool IsExtractVecSrc =
-      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
-       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
-  bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+  bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
+  bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+                          Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
+  bool IsLoadSrc = isa<LoadSDNode>(Val);
   BaseIndexOffset LBasePtr;
   // Match on the load's base pointer if relevant.
-  if (IsLoadSrc)
-    LBasePtr = BaseIndexOffset::match(
-        cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG);
-
+  EVT LoadVT;
+  if (IsLoadSrc) {
+    auto *Ld = cast<LoadSDNode>(Val);
+    LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+    LoadVT = Ld->getMemoryVT();
+    // Load and store should be the same type.
+    if (MemVT != LoadVT)
+      return;
+  }
   auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                             int64_t &Offset) -> bool {
     if (Other->isVolatile() || Other->isIndexed())
       return false;
-    // We can merge constant floats to equivalent integers
-    if (Other->getMemoryVT() != MemVT)
-      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
-            isa<ConstantFPSDNode>(Other->getValue())))
-        return false;
+    SDValue Val = peekThroughBitcast(Other->getValue());
+    // Allow merging constants of different types as integers.
+    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
+                                           : Other->getMemoryVT() != MemVT;
     if (IsLoadSrc) {
+      if (NoTypeMatch)
+        return false;
       // The Load's Base Ptr must also match
-      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) {
+      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
         auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+        if (LoadVT != OtherLd->getMemoryVT())
+          return false;
         if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
           return false;
       } else
         return false;
     }
-    if (IsConstantSrc)
-      if (!(isa<ConstantSDNode>(Other->getValue()) ||
-            isa<ConstantFPSDNode>(Other->getValue())))
+    if (IsConstantSrc) {
+      if (NoTypeMatch)
         return false;
-    if (IsExtractVecSrc)
-      if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
-            Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+      if (!(isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val)))
+        return false;
+    }
+    if (IsExtractVecSrc) {
+      // Do not merge truncated stores here.
+      if (Other->isTruncatingStore())
         return false;
+      if (!MemVT.bitsEq(Val.getValueType()))
+        return false;
+      if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+          Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+        return false;
+    }
     Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
     return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
   };
+
   // We are looking for a root node which is an ancestor to all mergeable
   // stores. We search up through a load, to our root and then down
   // through all children. For instance we will find Store{1,2,3} if
@@ -12612,10 +13044,8 @@ void DAGCombiner::getStoreMergeCandidates(
   // indirectly through its operand (we already consider dependencies
   // through the chain). Check in parallel by searching up from
   // non-chain operands of candidates.
- bool DAGCombiner::checkMergeStoreCandidatesForDependencies( SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) { - // FIXME: We should be able to truncate a full search of // predecessors by doing a BFS and keeping tabs the originating // stores from which worklist nodes come from in a similar way to @@ -12648,12 +13078,13 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { return false; EVT MemVT = St->getMemoryVT(); - int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + int64_t ElementSizeBytes = MemVT.getStoreSize(); + unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) return false; - bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute( + bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); // This function cannot currently deal with non-byte-sized memory sizes. @@ -12665,7 +13096,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { // Perform an early exit check. Do not bother looking at stored values that // are not constants, loads, or extracted vector elements. - SDValue StoredVal = St->getValue(); + SDValue StoredVal = peekThroughBitcast(St->getValue()); bool IsLoadSrc = isa<LoadSDNode>(StoredVal); bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) || isa<ConstantFPSDNode>(StoredVal); @@ -12675,12 +13106,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; - // Don't merge vectors into wider vectors if the source data comes from loads. - // TODO: This restriction can be lifted by using logic similar to the - // ExtractVecSrc case. - if (MemVT.isVector() && IsLoadSrc) - return false; - SmallVector<MemOpLink, 8> StoreNodes; // Find potential store merge candidates by searching through chain sub-DAG getStoreMergeCandidates(St, StoreNodes); @@ -12759,19 +13184,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { unsigned LastLegalVectorType = 1; bool LastIntegerTrunc = false; bool NonZero = false; + unsigned FirstZeroAfterNonZero = NumConsecutiveStores; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); SDValue StoredVal = ST->getValue(); - - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) { - NonZero |= !C->isNullValue(); - } else if (ConstantFPSDNode *C = - dyn_cast<ConstantFPSDNode>(StoredVal)) { - NonZero |= !C->getConstantFPValue()->isNullValue(); - } else { - // Non-constant. - break; + bool IsElementZero = false; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) + IsElementZero = C->isNullValue(); + else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) + IsElementZero = C->getConstantFPValue()->isNullValue(); + if (IsElementZero) { + if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) + FirstZeroAfterNonZero = i; } + NonZero |= !IsElementZero; // Find a legal type for the constant store. 
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; @@ -12791,8 +13217,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && - TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, - FirstStoreAS, FirstStoreAlign, &IsFast) && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { LastIntegerTrunc = true; LastLegalType = i + 1; @@ -12806,13 +13232,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. - unsigned Elts = i + 1; - if (MemVT.isVector()) { - // When merging vector stores, get the total number of elements. - Elts *= MemVT.getVectorNumElements(); - } + unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); - if (TLI.isTypeLegal(Ty) && + if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && @@ -12821,23 +13243,34 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { } } + bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; + unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; + // Check if we found a legal integer type that creates a meaningful merge. - if (LastLegalType < 2 && LastLegalVectorType < 2) { - StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); + if (NumElem < 2) { + // We know that candidate stores are in order and of correct + // shape. While there is no mergeable sequence from the + // beginning one may start later in the sequence. The only + // reason a merge of size N could have failed where another of + // the same size would not have, is if the alignment has + // improved or we've dropped a non-zero value. Drop as many + // candidates as we can here. + unsigned NumSkip = 1; + while ( + (NumSkip < NumConsecutiveStores) && + (NumSkip < FirstZeroAfterNonZero) && + (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) { + NumSkip++; + } + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); continue; } - bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; - unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - bool Merged = MergeStoresOfConstantsOrVecElts( StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); - if (!Merged) { - StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); - continue; - } + RV |= Merged; + // Remove merged stores for next iteration. - RV = true; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); continue; } @@ -12849,25 +13282,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned NumStoresToMerge = 1; - bool IsVec = MemVT.isVector(); for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - unsigned StoreValOpcode = St->getValue().getOpcode(); + SDValue StVal = peekThroughBitcast(St->getValue()); // This restriction could be loosened. // Bail out if any stored values are not elements extracted from a // vector. 
It should be possible to handle mixed sources, but load // sources need more careful handling (see the block of code below that // handles consecutive loads). - if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && - StoreValOpcode != ISD::EXTRACT_SUBVECTOR) + if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT && + StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR) return RV; // Find a legal type for the vector store. - unsigned Elts = i + 1; - if (IsVec) { - // When merging vector stores, get the total number of elements. - Elts *= MemVT.getVectorNumElements(); - } + unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); bool IsFast; @@ -12879,6 +13307,23 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { NumStoresToMerge = i + 1; } + // Check if we found a legal integer type that creates a meaningful merge. + if (NumStoresToMerge < 2) { + // We know that candidate stores are in order and of correct + // shape. While there is no mergeable sequence from the + // beginning one may start later in the sequence. The only + // reason a merge of size N could have failed where another of + // the same size would not have, is if the alignment has + // improved. Drop as many candidates as we can here. + unsigned NumSkip = 1; + while ((NumSkip < NumConsecutiveStores) && + (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) + NumSkip++; + + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); + continue; + } + bool Merged = MergeStoresOfConstantsOrVecElts( StoreNodes, MemVT, NumStoresToMerge, false, true, false); if (!Merged) { @@ -12905,7 +13350,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { BaseIndexOffset LdBasePtr; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue()); + SDValue Val = peekThroughBitcast(St->getValue()); + LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val); if (!Ld) break; @@ -12917,10 +13363,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (Ld->isVolatile() || Ld->isIndexed()) break; - // We do not accept ext loads. - if (Ld->getExtensionType() != ISD::NON_EXTLOAD) - break; - // The stored memory type must be the same. if (Ld->getMemoryVT() != MemVT) break; @@ -12986,7 +13428,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { isDereferenceable = false; // Find a legal type for the vector store. 
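// The candidate-skipping logic above avoids quadratic rescanning: when no
// merge starts at the first store, the next viable start is the first
// candidate whose alignment improves on the first one's, so everything before
// it can be dropped at once. A minimal standalone sketch of that scan
// (Aligns is a hypothetical stand-in for the per-store alignments):

#include <cstddef>
#include <vector>

static std::size_t numCandidatesToSkip(const std::vector<unsigned> &Aligns) {
  std::size_t NumSkip = 1;
  while (NumSkip < Aligns.size() && Aligns[NumSkip] <= Aligns[0])
    ++NumSkip;
  return NumSkip; // resume the merge search at this index
}

int main() {
  const std::vector<unsigned> Aligns = {4, 4, 4, 8, 8};
  return numCandidatesToSkip(Aligns) == 3 ? 0 : 1;
}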
-       EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1);
+       unsigned Elts = (i + 1) * NumMemElts;
+       EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+
        bool IsFastSt, IsFastLd;
        if (TLI.isTypeLegal(StoreTy) &&
            TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
@@ -13023,8 +13467,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
                 TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
                                    StoreTy) &&
                 TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
-                TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
-                                       FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+                TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                                       FirstStoreAlign, &IsFastSt) &&
                 IsFastSt &&
                 TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
                                        FirstLoadAlign, &IsFastLd) &&
@@ -13047,7 +13491,19 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
    NumElem = std::min(LastLegalType, NumElem);

    if (NumElem < 2) {
-     StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+     // We know that candidate stores are in order and of correct
+     // shape. While there is no mergeable sequence from the
+     // beginning, one may start later in the sequence. The only
+     // reason a merge of size N could have failed where another of
+     // the same size would not have is if the alignment of either
+     // the load or store has improved. Drop as many candidates as we
+     // can here.
+     unsigned NumSkip = 1;
+     while ((NumSkip < LoadNodes.size()) &&
+            (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
+            (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+       NumSkip++;
+     StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      continue;
    }

@@ -13055,7 +13511,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
    // to memory.
    EVT JointMemOpVT;
    if (UseVectorTy) {
-     JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
+     // Find a legal type for the vector store.
+     unsigned Elts = NumElem * NumMemElts;
+     JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
    } else {
      unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
      JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
@@ -13104,12 +13562,17 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
                                    SDValue(NewLoad.getNode(), 1));
    }

-   // Replace all the stores with the new store.
-   for (unsigned i = 0; i < NumElem; ++i)
+   // Replace all the stores with the new store. Recursively remove the
+   // corresponding value if it's no longer used.
+   for (unsigned i = 0; i < NumElem; ++i) {
+     SDValue Val = StoreNodes[i].MemNode->getOperand(1);
      CombineTo(StoreNodes[i].MemNode, NewStore);
+     if (Val.getNode()->use_empty())
+       recursivelyDeleteUnusedNodes(Val.getNode());
+   }
+
    RV = true;
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
-   continue;
  }
  return RV;
}
@@ -13284,7 +13747,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {

    // See if we can simplify the input to this truncstore with knowledge that
    // only the low bits are being used.  For example:
    // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
-   SDValue Shorter = GetDemandedBits(
+   SDValue Shorter = DAG.GetDemandedBits(
        Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                                    ST->getMemoryVT().getScalarSizeInBits()));
    AddToWorklist(Value.getNode());
@@ -13356,11 +13819,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
  }

- // Only perform this optimization before the types are legal, because we
- // don't want to perform this optimization on every DAGCombine invocation.
- if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG
-                                          : !LegalTypes) {
-   for (;;) {
+ // Always perform this optimization before types are legal. If the target
+ // prefers, also try this after legalization to catch stores that were created
+ // by intrinsics or other nodes.
+ if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) {
+   while (true) {
      // There can be multiple store sequences on the same chain.
      // Keep trying to merge store sequences until we are unable to do so
      // or until we merge the last store on the chain.
@@ -13499,6 +13962,60 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
  return St1;
}

+/// Convert a disguised subvector insertion into a shuffle:
+/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+/// bitcast(shuffle (bitcast V), (extended X), Mask)
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
+SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+  SDValue InsertVal = N->getOperand(1);
+  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+      !InsertVal.getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  SDValue SubVec = InsertVal.getOperand(0);
+  SDValue DestVec = N->getOperand(0);
+  EVT SubVecVT = SubVec.getValueType();
+  EVT VT = DestVec.getValueType();
+  unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+  unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+  // Step 1: Create a shuffle mask that implements this insert operation. The
+  // vector that we are inserting into will be operand 0 of the shuffle, so
+  // those elements are just 'i'. The inserted subvector is in the first
+  // positions of operand 1 of the shuffle. Example:
+  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+  SmallVector<int, 16> Mask(NumMaskVals);
+  for (unsigned i = 0; i != NumMaskVals; ++i) {
+    if (i / NumSrcElts == InsIndex)
+      Mask[i] = (i % NumSrcElts) + NumMaskVals;
+    else
+      Mask[i] = i;
+  }
+
+  // Bail out if the target cannot handle the shuffle we want to create.
+  EVT SubVecEltVT = SubVecVT.getVectorElementType();
+  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
+    return SDValue();
+
+  // Step 2: Create a wide vector from the inserted source vector by appending
+  // undefined elements. This is the same size as our destination vector.
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+  ConcatOps[0] = SubVec;
+  SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
+
+  // Step 3: Shuffle in the padded subvector.
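+  // (A worked check of the Step 1 mask against the example above: with
+  // NumSrcElts = 2, ExtendRatio = 128 / 32 = 4 and NumMaskVals = 8,
+  // positions 4 and 5 satisfy i / NumSrcElts == InsIndex = 2 and receive
+  // (i % 2) + 8 = 8 and 9, while every other position keeps 'i', giving
+  // exactly {0,1,2,3,8,9,6,7}. The values are from the comment's example,
+  // not from the change itself.)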
+  SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+  SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+  AddToWorklist(PaddedSubV.getNode());
+  AddToWorklist(DestVecBC.getNode());
+  AddToWorklist(Shuf.getNode());
+  return DAG.getBitcast(VT, Shuf);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -13511,10 +14028,20 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {

   EVT VT = InVec.getValueType();

-  // Check that we know which element is being inserted
-  if (!isa<ConstantSDNode>(EltNo))
+  // Remove redundant insertions:
+  // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
+  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
+    return InVec;
+
+  // We must know which element is being inserted for folds below here.
+  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+  if (!IndexC)
     return SDValue();
-  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  unsigned Elt = IndexC->getZExtValue();
+
+  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+    return Shuf;

   // Canonicalize insert_vector_elt dag nodes.
   // Example:
@@ -13692,9 +14219,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }

-  // extract_vector_elt (v2i32 (bitcast i64:x)), 0 -> i32 (trunc i64:x)
+  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
+  bool isLE = DAG.getDataLayout().isLittleEndian();
+  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
   if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
-      ConstEltNo->isNullValue() && VT.isInteger()) {
+      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
     SDValue BCSrc = InVec.getOperand(0);
     if (BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
@@ -13748,7 +14277,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {

     // FIXME: We should handle recursing on other vector shuffles and
     // scalar_to_vector here as well.
-    if (!LegalOperations) {
+    if (!LegalOperations ||
+        // FIXME: Should really be just isOperationLegalOrCustom.
+        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
+        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
       EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
                          DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
@@ -14054,10 +14586,18 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;

-  unsigned Vec2Offset = InVT1.getVectorNumElements();
+  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;

+  // In case both input vectors are extracted from the same base
+  // vector, we do not need the extra addend (Vec2Offset) while
+  // computing the shuffle mask.
+  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
+    Vec2Offset = InVT1.getVectorNumElements();
+
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
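   // (Illustration with example types only: if VT = v4i32 and both inputs are
   // v2i32, each input is widened below with undef lanes via CONCAT_VECTORS;
   // if instead InVT1 = v8i32, twice the size of VT, the shuffle is built at
   // the wider type and the low half extracted, guarded by
   // TLI.isExtractSubvectorCheap.)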
   if (InVT1 != VT || InVT2 != VT) {
@@ -14072,7 +14612,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
       VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
       VecIn2 = SDValue();
     } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
-      if (!TLI.isExtractSubvectorCheap(VT, NumElems))
+      if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
         return SDValue();

       if (!VecIn2.getNode()) {
@@ -14204,7 +14744,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
         !isa<ConstantSDNode>(Op.getOperand(1)))
       return SDValue();
-
     SDValue ExtractedFromVec = Op.getOperand(0);

     // All inputs must have the same element type as the output.
@@ -14227,6 +14766,50 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   if (VecIn.size() < 2)
     return SDValue();

+  // If all the operands of the BUILD_VECTOR extract from the same
+  // vector, then split the vector efficiently based on the maximum
+  // vector access index and adjust the VectorMask and
+  // VecIn accordingly.
+  if (VecIn.size() == 2) {
+    unsigned MaxIndex = 0;
+    unsigned NearestPow2 = 0;
+    SDValue Vec = VecIn.back();
+    EVT InVT = Vec.getValueType();
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SmallVector<unsigned, 8> IndexVec(NumElems, 0);
+
+    for (unsigned i = 0; i < NumElems; i++) {
+      if (VectorMask[i] <= 0)
+        continue;
+      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
+      IndexVec[i] = Index;
+      MaxIndex = std::max(MaxIndex, Index);
+    }
+
+    NearestPow2 = PowerOf2Ceil(MaxIndex);
+    if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
+        NumElems * 2 < NearestPow2) {
+      unsigned SplitSize = NearestPow2 / 2;
+      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                     InVT.getVectorElementType(), SplitSize);
+      if (TLI.isTypeLegal(SplitVT)) {
+        SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                     DAG.getConstant(SplitSize, DL, IdxTy));
+        SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                     DAG.getConstant(0, DL, IdxTy));
+        VecIn.pop_back();
+        VecIn.push_back(VecIn1);
+        VecIn.push_back(VecIn2);
+
+        for (unsigned i = 0; i < NumElems; i++) {
+          if (VectorMask[i] <= 0)
+            continue;
+          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
+        }
+      }
+    }
+  }
+
   // TODO: We want to sort the vectors by descending length, so that adjacent
   // pairs have similar length, and the longer vector is always first in the
   // pair.
@@ -14315,77 +14898,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
           DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
     }
   }
-
   return Shuffles[0];
 }

-// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
-// operations which can be matched to a truncate.
-SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) {
-  // TODO: Add support for big-endian.
-  if (DAG.getDataLayout().isBigEndian())
-    return SDValue();
-  if (N->getNumOperands() < 2)
-    return SDValue();
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-  unsigned NumElems = N->getNumOperands();
-
-  if (!isTypeLegal(VT))
-    return SDValue();
-
-  // If the input is something other than an EXTRACT_VECTOR_ELT with a constant
-  // index, bail out.
-  // TODO: Allow undef elements in some cases?
-  if (any_of(N->ops(), [VT](SDValue Op) {
-        return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-               !isa<ConstantSDNode>(Op.getOperand(1)) ||
-               Op.getValueType() != VT.getVectorElementType();
-      }))
-    return SDValue();
-
-  // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index
-  auto GetExtractIdx = [](SDValue Extract) {
-    return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue();
-  };
-
-  // The first BUILD_VECTOR operand must be an extract from index zero
-  // (assuming no undef and little-endian).
-  if (GetExtractIdx(N->getOperand(0)) != 0)
-    return SDValue();
-
-  // Compute the stride from the first index.
-  int Stride = GetExtractIdx(N->getOperand(1));
-  SDValue ExtractedFromVec = N->getOperand(0).getOperand(0);
-
-  // Proceed only if the stride and the types can be matched to a truncate.
-  if ((Stride == 1 || !isPowerOf2_32(Stride)) ||
-      (ExtractedFromVec.getValueType().getVectorNumElements() !=
-       Stride * NumElems) ||
-      (VT.getScalarSizeInBits() * Stride > 64))
-    return SDValue();
-
-  // Check remaining operands are consistent with the computed stride.
-  for (unsigned i = 1; i != NumElems; ++i) {
-    SDValue Op = N->getOperand(i);
-
-    if ((Op.getOperand(0) != ExtractedFromVec) ||
-        (GetExtractIdx(Op) != Stride * i))
-      return SDValue();
-  }
-
-  // All checks were ok, construct the truncate.
-  LLVMContext &Ctx = *DAG.getContext();
-  EVT NewVT = VT.getVectorVT(
-      Ctx, EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits() * Stride), NumElems);
-  EVT TruncVT =
-      VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
-
-  SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec);
-  Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res);
-  return DAG.getBitcast(VT, Res);
-}
-
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);
@@ -14428,10 +14943,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
     return V;

-  if (TLI.isDesirableToCombineBuildVectorToTruncate())
-    if (SDValue V = reduceBuildVecToTrunc(N))
-      return V;
-
   if (SDValue V = reduceBuildVecToShuffle(N))
     return V;

@@ -14514,8 +15025,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
   for (SDValue Op : N->ops()) {
     // Peek through any bitcast.
-    while (Op.getOpcode() == ISD::BITCAST)
-      Op = Op.getOperand(0);
+    Op = peekThroughBitcast(Op);

     // UNDEF nodes convert to UNDEF shuffle mask values.
     if (Op.isUndef()) {
@@ -14534,8 +15044,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
     EVT ExtVT = ExtVec.getValueType();

     // Peek through any bitcast.
-    while (ExtVec.getOpcode() == ISD::BITCAST)
-      ExtVec = ExtVec.getOperand(0);
+    ExtVec = peekThroughBitcast(ExtVec);

     // UNDEF nodes convert to UNDEF shuffle mask values.
     if (ExtVec.isUndef()) {
@@ -14760,9 +15269,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {

   // We are looking for an optionally bitcasted wide vector binary operator
   // feeding an extract subvector.
-  SDValue BinOp = Extract->getOperand(0);
-  if (BinOp.getOpcode() == ISD::BITCAST)
-    BinOp = BinOp.getOperand(0);
+  SDValue BinOp = peekThroughBitcast(Extract->getOperand(0));

   // TODO: The motivating case for this transform is an x86 AVX1 target. That
   // target has temptingly almost legal versions of bitwise logic ops in 256-bit
@@ -14786,13 +15293,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
     return SDValue();

   // Peek through bitcasts of the binary operator operands if needed.
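   // (Each open-coded loop removed here, and in the other hunks of this
   // patch, matches the behavior of the peekThroughBitcast helper that
   // replaces it, roughly:
   //
   //   SDValue peekThroughBitcast(SDValue V) {
   //     while (V.getOpcode() == ISD::BITCAST)
   //       V = V.getOperand(0);
   //     return V;
   //   }
   //
   // The helper itself is assumed to be defined earlier in this file.)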
-  SDValue LHS = BinOp.getOperand(0);
-  if (LHS.getOpcode() == ISD::BITCAST)
-    LHS = LHS.getOperand(0);
-
-  SDValue RHS = BinOp.getOperand(1);
-  if (RHS.getOpcode() == ISD::BITCAST)
-    RHS = RHS.getOperand(0);
+  SDValue LHS = peekThroughBitcast(BinOp.getOperand(0));
+  SDValue RHS = peekThroughBitcast(BinOp.getOperand(1));

   // We need at least one concatenation operation of a binop operand to make
   // this transform worthwhile. The concat must double the input vector sizes.
@@ -14891,8 +15393,34 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   }

   // Skip bitcasting
-  if (V->getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
+  V = peekThroughBitcast(V);
+
+  // If the input is a build vector, try to make a smaller build vector.
+  if (V->getOpcode() == ISD::BUILD_VECTOR) {
+    if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      EVT InVT = V->getValueType(0);
+      unsigned ExtractSize = NVT.getSizeInBits();
+      unsigned EltSize = InVT.getScalarSizeInBits();
+      // Only do this if we won't split any elements.
+      if (ExtractSize % EltSize == 0) {
+        unsigned NumElems = ExtractSize / EltSize;
+        EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
+                                         InVT.getVectorElementType(), NumElems);
+        if ((!LegalOperations ||
+             TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) &&
+            (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
+          unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
+                            EltSize;
+
+          // Extract the pieces from the original build_vector.
+          SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
+                                                makeArrayRef(V->op_begin() + IdxVal,
+                                                             NumElems));
+          return DAG.getBitcast(NVT, BuildVec);
+        }
+      }
+    }
+  }

   if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
     // Handle only simple case where vector being inserted and vector
@@ -15013,6 +15541,37 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
   return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
 }

+static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
+                                   SDValue N1, SelectionDAG &DAG) {
+  auto isUndefElt = [](SDValue V, int Idx) {
+    // TODO - handle more cases as required.
+    if (V.getOpcode() == ISD::BUILD_VECTOR)
+      return V.getOperand(Idx).isUndef();
+    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return (Idx != 0) || V.getOperand(0).isUndef();
+    return false;
+  };
+
+  EVT VT = SVN->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  bool Changed = false;
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Idx = SVN->getMaskElt(i);
+    if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) ||
+        ((int)NumElts <= Idx && isUndefElt(N1, Idx - NumElts))) {
+      Changed = true;
+      Idx = -1;
+    }
+    NewMask.push_back(Idx);
+  }
+  if (Changed)
+    return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask);
+
+  return SDValue();
+}
+
 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
 // or turn a shuffle of a single concat into simpler shuffle then concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
@@ -15091,7 +15650,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
 //
 // To deal with this, we currently use a bunch of mostly arbitrary heuristics.
 // We don't fold shuffles where one side is a non-zero constant, and we don't
-// fold shuffles if the resulting BUILD_VECTOR would have duplicate
+// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
 // non-constant operands. This seems to work out reasonably well in practice.
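// (An illustrative instance of the fold implemented below, example values
// only:
//   shuffle (build_vector a, b, c, d), (build_vector 0, 0, 0, 0), <0,5,2,7>
//     -> build_vector a, 0, c, 0
// The all-zeros side is tolerated per the heuristic above; a duplicated
// non-constant lane is rejected unless both inputs splat the same value.)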
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                       SelectionDAG &DAG,
@@ -15103,6 +15662,7 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
   if (!N0->hasOneUse() || !N1->hasOneUse())
     return SDValue();
+
   // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
   // discussed above.
   if (!N1.isUndef()) {
@@ -15114,6 +15674,15 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
       return SDValue();
   }

+  // If both inputs are splats of the same value then we can safely merge this
+  // into a single BUILD_VECTOR with undef elements based on the shuffle mask.
+  bool IsSplat = false;
+  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+  if (BV0 && BV1)
+    if (SDValue Splat0 = BV0->getSplatValue())
+      IsSplat = (Splat0 == BV1->getSplatValue());
+
   SmallVector<SDValue, 8> Ops;
   SmallSet<SDValue, 16> DuplicateOps;
   for (int M : SVN->getMask()) {
@@ -15124,23 +15693,25 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
       if (S.getOpcode() == ISD::BUILD_VECTOR) {
         Op = S.getOperand(Idx);
       } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-        if (Idx == 0)
-          Op = S.getOperand(0);
+        assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index.");
+        Op = S.getOperand(0);
       } else {
         // Operand can't be combined - bail out.
         return SDValue();
       }
     }

-    // Don't duplicate a non-constant BUILD_VECTOR operand; semantically, this is
-    // fine, but it's likely to generate low-quality code if the target can't
-    // reconstruct an appropriate shuffle.
+    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
+    // generating a splat; semantically, this is fine, but it's likely to
+    // generate low-quality code if the target can't reconstruct an appropriate
+    // shuffle.
     if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
-      if (!DuplicateOps.insert(Op).second)
+      if (!IsSplat && !DuplicateOps.insert(Op).second)
         return SDValue();

     Ops.push_back(Op);
   }
+
   // BUILD_VECTOR requires all inputs to be of the same type, find the
   // maximum type and extend them all.
   EVT SVT = VT.getScalarType();
@@ -15162,7 +15733,8 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
 static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
                                             SelectionDAG &DAG,
                                             const TargetLowering &TLI,
-                                            bool LegalOperations) {
+                                            bool LegalOperations,
+                                            bool LegalTypes) {
   EVT VT = SVN->getValueType(0);
   bool IsBigEndian = DAG.getDataLayout().isBigEndian();

@@ -15190,14 +15762,18 @@ static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,

   // Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
   // power-of-2 extensions as they are the most likely.
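   // (An illustrative match, little-endian with example types only: a v8i16
   // shuffle of X with mask <0,-1,1,-1,2,-1,3,-1>, where -1 is undef, places
   // element i at position i * Scale for Scale = 2, so it is rebuilt as
   // any_extend_vector_inreg from v8i16 X to v4i32 and bitcast back to v8i16.)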
   for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+    // Check for non-power-of-2 vector sizes.
+    if (NumElts % Scale != 0)
+      continue;
     if (!isAnyExtend(Scale))
       continue;

     EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
-    if (!LegalOperations ||
-        TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
-      return DAG.getBitcast(VT,
+    if (!LegalTypes || TLI.isTypeLegal(OutVT))
+      if (!LegalOperations ||
+          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+        return DAG.getBitcast(VT,
                             DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
   }

@@ -15218,9 +15794,7 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
   if (!VT.isInteger() || IsBigEndian)
     return SDValue();

-  SDValue N0 = SVN->getOperand(0);
-  while (N0.getOpcode() == ISD::BITCAST)
-    N0 = N0.getOperand(0);
+  SDValue N0 = peekThroughBitcast(SVN->getOperand(0));

   unsigned Opcode = N0.getOpcode();
   if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
@@ -15316,6 +15890,84 @@ static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
                               NewMask);
 }

+/// If the shuffle mask is taking exactly one element from the first vector
+/// operand and passing through all other elements from the second vector
+/// operand, return the index of the mask element that is choosing an element
+/// from the first operand. Otherwise, return -1.
+static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
+  int MaskSize = Mask.size();
+  int EltFromOp0 = -1;
+  // TODO: This does not match if there are undef elements in the shuffle mask.
+  // Should we ignore undefs in the shuffle mask instead? The trade-off is
+  // removing an instruction (a shuffle), but losing the knowledge that some
+  // vector lanes are not needed.
+  for (int i = 0; i != MaskSize; ++i) {
+    if (Mask[i] >= 0 && Mask[i] < MaskSize) {
+      // We're looking for a shuffle of exactly one element from operand 0.
+      if (EltFromOp0 != -1)
+        return -1;
+      EltFromOp0 = i;
+    } else if (Mask[i] != i + MaskSize) {
+      // Nothing from operand 1 can change lanes.
+      return -1;
+    }
+  }
+  return EltFromOp0;
+}
+
+/// If a shuffle inserts exactly one element from a source vector operand into
+/// another vector operand and we can access the specified element as a scalar,
+/// then we can eliminate the shuffle.
+static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  // First, check if we are taking one element of a vector and shuffling that
+  // element into another vector.
+  ArrayRef<int> Mask = Shuf->getMask();
+  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
+  SDValue Op0 = Shuf->getOperand(0);
+  SDValue Op1 = Shuf->getOperand(1);
+  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
+  if (ShufOp0Index == -1) {
+    // Commute mask and check again.
+    ShuffleVectorSDNode::commuteMask(CommutedMask);
+    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
+    if (ShufOp0Index == -1)
+      return SDValue();
+    // Commute operands to match the commuted shuffle mask.
+    std::swap(Op0, Op1);
+    Mask = CommutedMask;
+  }
+
+  // The shuffle inserts exactly one element from operand 0 into operand 1.
+  // Now see if we can access that element as a scalar via a real insert element
+  // instruction.
+  // TODO: We can try harder to locate the element as a scalar. Examples: it
+  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
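  // (A worked instance for the code below, 4-element vectors and example
  // values only: mask <4,5,0,7> passes lanes 0, 1 and 3 through from
  // operand 1 and takes lane 2 from operand 0, so ShufOp0Index == 2 with
  // Mask[2] == 0. If operand 0 is (insertelt v1, x, 0), the insert index
  // matches Mask[2] and the whole shuffle becomes:
  //   shuffle (insertelt v1, x, 0), v2, <4,5,0,7> -> insertelt v2, x, 2.)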
+  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
+         "Shuffle mask value must be from operand 0");
+  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
+  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
+    return SDValue();
+
+  // There's an existing insertelement with constant insertion index, so we
+  // don't need to check the legality/profitability of a replacement operation
+  // that differs at most in the constant value. The target should be able to
+  // lower any of those in a similar way. If not, legalization will expand this
+  // to a scalar-to-vector plus shuffle.
+  //
+  // Note that the shuffle may move the scalar from the position that the insert
+  // element used. Therefore, our new insert element occurs at the shuffle's
+  // mask index value, not the insert's index value.
+  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+  SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
+                                        Op0.getOperand(2).getValueType());
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
+                     Op1, Op0.getOperand(1), NewInsIndex);
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -15362,6 +16014,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
   }

+  // Simplify shuffle mask if a referenced element is UNDEF.
+  if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
+    return V;
+
+  if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
+    return InsElt;
+
   // A shuffle of a single vector that is a splat can always be folded.
   if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
     if (N1->isUndef() && N0Shuf->isSplat())
@@ -15426,7 +16085,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     return S;

   // Match shuffles that can be converted to any_vector_extend_in_reg.
-  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
+  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations,
+                                               LegalTypes))
     return V;

   // Combine "truncate_vector_in_reg" style shuffles.
@@ -15486,7 +16145,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       if (TLI.isTypeLegal(ScaleVT) &&
           0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
           0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
-
         int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
         int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();

@@ -15661,23 +16319,46 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);

   // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
-  // with a VECTOR_SHUFFLE.
+  // with a VECTOR_SHUFFLE and possible truncate.
   if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     SDValue InVec = InVal->getOperand(0);
     SDValue EltNo = InVal->getOperand(1);
-
-    // FIXME: We could support implicit truncation if the shuffle can be
-    // scaled to a smaller vector scalar type.
-    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo);
-    if (C0 && VT == InVec.getValueType() &&
-        VT.getScalarType() == InVal.getValueType()) {
-      SmallVector<int, 8> NewMask(VT.getVectorNumElements(), -1);
+    auto InVecT = InVec.getValueType();
+    if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
+      SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
       int Elt = C0->getZExtValue();
       NewMask[0] = Elt;
-
-      if (TLI.isShuffleMaskLegal(NewMask, VT))
-        return DAG.getVectorShuffle(VT, SDLoc(N), InVec, DAG.getUNDEF(VT),
-                                    NewMask);
+      SDValue Val;
+      // If we have an implicit truncate, do the truncate here as long as it's
+      // legal; if it's not legal, none of the folds below will apply.
+      if (VT.getScalarType() != InVal.getValueType() &&
+          InVal.getValueType().isScalarInteger() &&
+          isTypeLegal(VT.getScalarType())) {
+        Val =
+            DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
+      }
+      if (VT.getScalarType() == InVecT.getScalarType() &&
+          VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
+          TLI.isShuffleMaskLegal(NewMask, VT)) {
+        Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
+                                   DAG.getUNDEF(InVecT), NewMask);
+        // If the initial vector is the correct size, this shuffle is a
+        // valid result.
+        if (VT == InVecT)
+          return Val;
+        // If not, we must truncate the vector.
+        if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
+          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+          SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
+          EVT SubVT =
+              EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
+                               VT.getVectorNumElements());
+          Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
+                            ZeroIdx);
+          return Val;
+        }
+      }
     }
   }

@@ -15694,12 +16375,47 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   if (N1.isUndef())
     return N0;

+  // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow
+  // us to pull BITCASTs from input to output.
+  if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
+    if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);
+
   // If this is an insert of an extracted vector into an undef vector, we can
   // just use the input to the extract.
   if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
     return N1.getOperand(0);

+  // If we are inserting a bitcast value into an undef, with the same
+  // number of elements, just use the bitcast input of the extract.
+  // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
+  //      BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
+  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
+      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N1.getOperand(0).getOperand(1) == N2 &&
+      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
+          VT.getVectorNumElements()) {
+    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
+  }
+
+  // If both N0 and N1 are bitcast values on which insert_subvector
+  // would make sense, pull the bitcast through.
+  // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
+  //      BITCAST (INSERT_SUBVECTOR N0 N1 N2)
+  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+    SDValue CN0 = N0.getOperand(0);
+    SDValue CN1 = N1.getOperand(0);
+    if (CN0.getValueType().getVectorElementType() ==
+            CN1.getValueType().getVectorElementType() &&
+        CN0.getValueType().getVectorNumElements() ==
+            VT.getVectorNumElements()) {
+      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
+                                      CN0.getValueType(), CN0, CN1, N2);
+      return DAG.getBitcast(VT, NewINSERT);
+    }
+  }
+
   // Combine INSERT_SUBVECTORs where we are inserting to the same index.
   // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
   // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -15779,7 +16495,7 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
+  SDValue RHS = peekThroughBitcast(N->getOperand(1));
   SDLoc DL(N);

   // Make sure we're not running after operation legalization where it
@@ -15790,9 +16506,6 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
   if (N->getOpcode() != ISD::AND)
     return SDValue();

-  if (RHS.getOpcode() == ISD::BITCAST)
-    RHS = RHS.getOperand(0);
-
   if (RHS.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();

@@ -15945,7 +16658,6 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
 /// the DAG combiner loop to avoid it being looked at.
 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                                     SDValue RHS) {
-
   // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
   // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
   if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
@@ -16418,7 +17130,7 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
 SDValue DAGCombiner::BuildSDIV(SDNode *N) {
   // when optimising for minimum size, we don't want to expand a div to a mul
   // and a shift.
-  if (DAG.getMachineFunction().getFunction()->optForMinSize())
+  if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();

   ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
@@ -16429,7 +17141,7 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) {
   if (C->isNullValue())
     return SDValue();

-  std::vector<SDNode*> Built;
+  std::vector<SDNode *> Built;
   SDValue S =
       TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);

@@ -16464,7 +17176,7 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
 SDValue DAGCombiner::BuildUDIV(SDNode *N) {
   // when optimising for minimum size, we don't want to expand a div to a mul
   // and a shift.
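  // (For reference, the expansion being suppressed at minimum size turns a
  // divide by a constant into a multiply by a magic number plus shifts. A
  // minimal standalone sketch for signed division by 3 follows; 0x55555556 is
  // the standard multiplier for this divisor, and the real lowerings are
  // TLI.BuildSDIV and TLI.BuildUDIV. The helper below is illustrative only.)
  #include <cstdint>

  static int32_t sdiv3(int32_t N) {
    // High 32 bits of the 64-bit product; assumes arithmetic right shift.
    int32_t Q = static_cast<int32_t>((int64_t{0x55555556} * N) >> 32);
    // Add 1 for negative N to round the quotient toward zero, matching C.
    return Q + static_cast<int32_t>(static_cast<uint32_t>(N) >> 31);
  }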
-  if (DAG.getMachineFunction().getFunction()->optForMinSize())
+  if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();

   ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
@@ -16475,7 +17187,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
   if (C->isNullValue())
     return SDValue();

-  std::vector<SDNode*> Built;
+  std::vector<SDNode *> Built;
   SDValue S =
       TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);

@@ -16760,8 +17472,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
   if (Op1->isInvariant() && Op0->writeMem())
     return false;

-  unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3;
-  unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3;
+  unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize();
+  unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();

   // Check for BaseIndexOffset matching.
   BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
@@ -16957,7 +17669,11 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
 /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
 /// (aliasing node).
 SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
-  SmallVector<SDValue, 8> Aliases; // Ops for replacing token factor.
+  if (OptLevel == CodeGenOpt::None)
+    return OldChain;
+
+  // Ops for replacing token factor.
+  SmallVector<SDValue, 8> Aliases;

   // Accumulate all the aliases to this node.
   GatherAllAliases(N, OldChain, Aliases);
@@ -16987,6 +17703,9 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
 // to go from a partially-merged state to the desired final
 // fully-merged state.
 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
   BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);