path: root/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
author    Dimitry Andric <dim@FreeBSD.org>    2017-12-18 20:10:56 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2017-12-18 20:10:56 +0000
commit    044eb2f6afba375a914ac9d8024f8f5142bb912e (patch)
tree      1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/CodeGen/SelectionDAG/DAGCombiner.cpp
parent    eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff)
Diffstat (limited to 'lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2497
1 file changed, 1608 insertions(+), 889 deletions(-)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 432c86dd6f1e1..f97732c1c49d0 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1,4 +1,4 @@
-//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,32 +16,64 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "dagcombine"
@@ -53,43 +85,41 @@ STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of load sliced");
-namespace {
- static cl::opt<bool>
- CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
- cl::desc("Enable DAG combiner's use of IR alias analysis"));
+static cl::opt<bool>
+CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+ cl::desc("Enable DAG combiner's use of IR alias analysis"));
- static cl::opt<bool>
- UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
- cl::desc("Enable DAG combiner's use of TBAA"));
+static cl::opt<bool>
+UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
+ cl::desc("Enable DAG combiner's use of TBAA"));
#ifndef NDEBUG
- static cl::opt<std::string>
- CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
- cl::desc("Only use DAG-combiner alias analysis in this"
- " function"));
+static cl::opt<std::string>
+CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
+ cl::desc("Only use DAG-combiner alias analysis in this"
+ " function"));
#endif
- /// Hidden option to stress test load slicing, i.e., when this option
- /// is enabled, load slicing bypasses most of its profitability guards.
- static cl::opt<bool>
- StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
- cl::desc("Bypass the profitability model of load "
- "slicing"),
- cl::init(false));
+/// Hidden option to stress test load slicing, i.e., when this option
+/// is enabled, load slicing bypasses most of its profitability guards.
+static cl::opt<bool>
+StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+ cl::desc("Bypass the profitability model of load slicing"),
+ cl::init(false));
- static cl::opt<bool>
- MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
- cl::desc("DAG combiner may split indexing from loads"));
+static cl::opt<bool>
+ MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner may split indexing from loads"));
-//------------------------------ DAGCombiner ---------------------------------//
+namespace {
class DAGCombiner {
SelectionDAG &DAG;
const TargetLowering &TLI;
CombineLevel Level;
CodeGenOpt::Level OptLevel;
- bool LegalOperations;
- bool LegalTypes;
+ bool LegalOperations = false;
+ bool LegalTypes = false;
bool ForCodeSize;
/// \brief Worklist of all of the nodes that need to be simplified.
@@ -128,6 +158,19 @@ namespace {
SDValue visit(SDNode *N);
public:
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
+ : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+ OptLevel(OL), AA(AA) {
+ ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();
+
+ MaximumLegalStoreInBits = 0;
+ for (MVT VT : MVT::all_valuetypes())
+ if (EVT(VT).isSimple() && VT != MVT::Other &&
+ TLI.isTypeLegal(EVT(VT)) &&
+ VT.getSizeInBits() >= MaximumLegalStoreInBits)
+ MaximumLegalStoreInBits = VT.getSizeInBits();
+ }
+
/// Add to the worklist making sure its instance is at the back (next to be
/// processed.)
void AddToWorklist(SDNode *N) {
@@ -285,7 +328,7 @@ namespace {
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
- SDValue visitAssertZext(SDNode *N);
+ SDValue visitAssertExt(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -348,6 +391,7 @@ namespace {
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
SDValue foldSelectOfConstants(SDNode *N);
+ SDValue foldVSelectOfConstants(SDNode *N);
SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
@@ -371,6 +415,7 @@ namespace {
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
+ SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
@@ -400,14 +445,11 @@ namespace {
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
- SDValue reduceBuildVecToTrunc(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
SDValue VecIn2, unsigned LeftIdx);
SDValue matchVSelectOpSizesWithSetCC(SDNode *N);
- SDValue GetDemandedBits(SDValue V, const APInt &Mask);
-
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -434,12 +476,14 @@ namespace {
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
- MemOpLink(LSBaseSDNode *N, int64_t Offset)
- : MemNode(N), OffsetFromBase(Offset) {}
// Ptr to the mem node.
LSBaseSDNode *MemNode;
+
// Offset from the base ptr.
int64_t OffsetFromBase;
+
+ MemOpLink(LSBaseSDNode *N, int64_t Offset)
+ : MemNode(N), OffsetFromBase(Offset) {}
};
/// This is a helper function for visitMUL to check the profitability
@@ -450,38 +494,49 @@ namespace {
SDValue &AddNode,
SDValue &ConstNode);
-
/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
- /// the type of the loaded value to be extended. LoadedVT returns the type
- /// of the original loaded value. NarrowLoad returns whether the load would
- /// need to be narrowed in order to match.
+ /// the type of the loaded value to be extended.
bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
- EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
- bool &NarrowLoad);
+ EVT LoadResultTy, EVT &ExtVT);
+
+ /// Helper function to calculate whether the given Load can have its
+ /// width reduced to ExtVT.
+ bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+ EVT &ExtVT, unsigned ShAmt = 0);
+
+ /// Used by BackwardsPropagateMask to find suitable loads.
+ bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
+ SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+ ConstantSDNode *Mask, SDNode *&NodeToMask);
+ /// Attempt to propagate a given AND node back to load leaves so that they
+ /// can be combined into narrow loads.
+ bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
/// Helper function for MergeConsecutiveStores which merges the
/// component store chains.
SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores);
- /// This is a helper function for MergeConsecutiveStores. When the source
- /// elements of the consecutive stores are all constants or all extracted
- /// vector elements, try to merge them into one larger store.
- /// \return True if a merged store was created.
+ /// This is a helper function for MergeConsecutiveStores. When the
+ /// source elements of the consecutive stores are all constants or
+ /// all extracted vector elements, try to merge them into one
+ /// larger store introducing bitcasts if necessary. \return True
+ /// if a merged store was created.
bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector,
bool UseTrunc);
- /// This is a helper function for MergeConsecutiveStores.
- /// Stores that may be merged are placed in StoreNodes.
+ /// This is a helper function for MergeConsecutiveStores. Stores
+ /// that potentially may be merged with St are placed in
+ /// StoreNodes.
void getStoreMergeCandidates(StoreSDNode *St,
SmallVectorImpl<MemOpLink> &StoreNodes);
/// Helper function for MergeConsecutiveStores. Checks if
- /// Candidate stores have indirect dependency through their
- /// operands. \return True if safe to merge
+ /// candidate stores have indirect dependency through their
+ /// operands. \return True if safe to merge.
bool checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
@@ -500,19 +555,6 @@ namespace {
SDValue distributeTruncateThroughAnd(SDNode *N);
public:
- DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
- : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
- ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
-
- MaximumLegalStoreInBits = 0;
- for (MVT VT : MVT::all_valuetypes())
- if (EVT(VT).isSimple() && VT != MVT::Other &&
- TLI.isTypeLegal(EVT(VT)) &&
- VT.getSizeInBits() >= MaximumLegalStoreInBits)
- MaximumLegalStoreInBits = VT.getSizeInBits();
- }
-
/// Runs the dag combiner on all nodes in the work list
void Run(CombineLevel AtLevel);
@@ -541,14 +583,12 @@ namespace {
return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
};
-}
-
-namespace {
/// This class is a DAGUpdateListener that removes any deleted
/// nodes from the worklist.
class WorklistRemover : public SelectionDAG::DAGUpdateListener {
DAGCombiner &DC;
+
public:
explicit WorklistRemover(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
@@ -557,7 +597,8 @@ public:
DC.removeFromWorklist(N);
}
};
-}
+
+} // end anonymous namespace
//===----------------------------------------------------------------------===//
// TargetLowering::DAGCombinerInfo implementation
@@ -577,7 +618,6 @@ CombineTo(SDNode *N, SDValue Res, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
}
-
SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
@@ -873,6 +913,56 @@ static bool isAnyConstantBuildVector(const SDNode *N) {
ISD::isBuildVectorOfConstantFPSDNodes(N);
}
+// Attempt to match a unary predicate against a scalar/splat constant or
+// every element of a constant BUILD_VECTOR.
+static bool matchUnaryPredicate(SDValue Op,
+ std::function<bool(ConstantSDNode *)> Match) {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
+ return Match(Cst);
+
+ if (ISD::BUILD_VECTOR != Op.getOpcode())
+ return false;
+
+ EVT SVT = Op.getValueType().getScalarType();
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
+ if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
+ return false;
+ }
+ return true;
+}
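
For illustration, matchUnaryPredicate lets the folds below accept either a lone ConstantSDNode or a whole constant BUILD_VECTOR through one code path. A standalone plain-C++ sketch of the same all-lanes discipline (illustrative names, not LLVM API; a std::vector stands in for the BUILD_VECTOR operands):

#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

// Mirrors matchUnaryPredicate: the predicate must hold for every lane,
// so a single non-matching vector element rejects the whole fold.
static bool matchAllLanes(const std::vector<uint64_t> &Lanes,
                          const std::function<bool(uint64_t)> &Match) {
  for (uint64_t L : Lanes)
    if (!Match(L))
      return false;
  return !Lanes.empty();
}

int main() {
  auto IsPow2 = [](uint64_t V) { return V && !(V & (V - 1)); };
  assert(matchAllLanes({4, 8, 16}, IsPow2)); // non-splat lanes still match
  assert(!matchAllLanes({4, 6}, IsPow2));    // one bad lane rejects the fold
}
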
+
+// Attempt to match a binary predicate against a pair of scalar/splat constants
+// or every element of a pair of constant BUILD_VECTORs.
+static bool matchBinaryPredicate(
+ SDValue LHS, SDValue RHS,
+ std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
+ if (LHS.getValueType() != RHS.getValueType())
+ return false;
+
+ if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
+ if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
+ return Match(LHSCst, RHSCst);
+
+ if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
+ ISD::BUILD_VECTOR != RHS.getOpcode())
+ return false;
+
+ EVT SVT = LHS.getValueType().getScalarType();
+ for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+ auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
+ auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
+ if (!LHSCst || !RHSCst)
+ return false;
+ if (LHSCst->getValueType(0) != SVT ||
+ LHSCst->getValueType(0) != RHSCst->getValueType(0))
+ return false;
+ if (!Match(LHSCst, RHSCst))
+ return false;
+ }
+ return true;
+}
+
SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1) {
EVT VT = N0.getValueType();
@@ -1123,10 +1213,10 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
Replace0 &= !N0->hasOneUse();
Replace1 &= (N0 != N1) && !N1->hasOneUse();
- // Combine Op here so it is presreved past replacements.
+ // Combine Op here so it is preserved past replacements.
CombineTo(Op.getNode(), RV);
- // If operands have a use ordering, make sur we deal with
+ // If operands have a use ordering, make sure we deal with
// predecessor first.
if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
std::swap(N0, N1);
@@ -1473,7 +1563,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
- case ISD::AssertZext: return visitAssertZext(N);
+ case ISD::AssertSext:
+ case ISD::AssertZext: return visitAssertExt(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1572,15 +1663,15 @@ SDValue DAGCombiner::combine(SDNode *N) {
}
}
- // If N is a commutative binary node, try commuting it to enable more
- // sdisel CSE.
+ // If N is a commutative binary node, try to eliminate it if the commuted
+ // version is already present in the DAG.
if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
N->getNumValues() == 1) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Constant operands are canonicalized to RHS.
- if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+ if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
SDValue Ops[] = {N1, N0};
SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
N->getFlags());
@@ -1632,7 +1723,6 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
// Check each of the operands.
for (const SDValue &Op : TF->op_values()) {
-
switch (Op.getOpcode()) {
case ISD::EntryToken:
// Entry tokens don't need to be added to the list. They are
@@ -1907,6 +1997,15 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
}
}
+
+ // Undo the add -> or combine to merge constant offsets from a frame index.
+ if (N0.getOpcode() == ISD::OR &&
+ isa<FrameIndexSDNode>(N0.getOperand(0)) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
+ SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
+ }
}
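
This hunk re-associates (or FI, c) + y back into FI + (c + y); that is only valid because haveNoCommonBitsSet proves the OR was carry-free, i.e. an ADD in disguise. A plain-C++ check of that identity with arbitrary illustrative values:

#include <cassert>
#include <cstdint>

int main() {
  // A frame index FI and an offset C with no common bits: FI | C == FI + C,
  // so (FI | C) + Y may be rewritten as FI + (C + Y).
  uint64_t FI = 0x1000, C = 0xF, Y = 0x30;
  assert((FI & C) == 0);                    // the haveNoCommonBitsSet premise
  assert(((FI | C) + Y) == (FI + (C + Y)));
}
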
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -2064,7 +2163,8 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference)
}
// (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
- if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)))
+ if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
+ N1.getResNo() == 0)
return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
N0, N1.getOperand(0), N1.getOperand(2));
@@ -2537,6 +2637,12 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
+ assert((!N0IsConst ||
+ ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
+ "Splat APInt should be element width");
+ assert((!N1IsConst ||
+ ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
+ "Splat APInt should be element width");
} else {
N0IsConst = isa<ConstantSDNode>(N0);
if (N0IsConst) {
@@ -2562,12 +2668,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1.isNullValue())
return N1;
- // We require a splat of the entire scalar bit width for non-contiguous
- // bit patterns.
- bool IsFullSplat =
- ConstValue1.getBitWidth() == VT.getScalarSizeInBits();
// fold (mul x, 1) -> x
- if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat)
+ if (N1IsConst && ConstValue1.isOneValue())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -2580,16 +2682,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
DAG.getConstant(0, DL, VT), N0);
}
// fold (mul x, (1 << c)) -> x << c
- if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isPowerOf2() &&
- IsFullSplat) {
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1) &&
+ (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
SDLoc DL(N);
- return DAG.getNode(ISD::SHL, DL, VT, N0,
- DAG.getConstant(ConstValue1.logBase2(), DL,
- getShiftAmountTy(N0.getValueType())));
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
+ AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
}
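
The power-of-two mul fold now goes through BuildLogBase2, so non-splat constant vectors qualify as well. The scalar identity it relies on, exhaustively checked in plain C++ for one illustrative value:

#include <cassert>
#include <cstdint>

int main() {
  // mul x, (1 << c) == shl x, c, including wrap-around: both sides are
  // computed modulo 2^32 for uint32_t.
  uint32_t X = 0xDEADBEEF;
  for (unsigned C = 0; C < 32; ++C)
    assert(X * (1u << C) == (X << C));
}
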
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
- if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2() &&
- IsFullSplat) {
+ if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
SDLoc DL(N);
// FIXME: If the input is something that is easily negated (e.g. a
@@ -2835,7 +2941,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
// If integer divide is expensive and we satisfy the requirements, emit an
// alternate sequence. Targets may check function attributes for size/speed
// trade-offs.
- AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildSDIV(N))
return Op;
@@ -2906,7 +3012,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
}
// fold (udiv x, c) -> alternate
- AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
@@ -2965,7 +3071,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
}
}
- AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
@@ -3003,19 +3109,26 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
+ if (VT.isVector()) {
+ // fold (mulhs x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N1;
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N0;
+ }
+
// fold (mulhs x, 0) -> 0
if (isNullConstant(N1))
return N1;
// fold (mulhs x, 1) -> (sra x, size(x)-1)
- if (isOneConstant(N1)) {
- SDLoc DL(N);
+ if (isOneConstant(N1))
return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
DAG.getConstant(N0.getValueSizeInBits() - 1, DL,
getShiftAmountTy(N0.getValueType())));
- }
+
// fold (mulhs x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
- return DAG.getConstant(0, SDLoc(N), VT);
+ return DAG.getConstant(0, DL, VT);
// If the type twice as wide is legal, transform the mulhs to a wider multiply
// plus a shift.
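
For reference, MULHS yields the high half of the sign-extended product, which is why the x*0 and x*1 folds above take the forms they do. A scalar model in plain C++ (not LLVM API; assumes the usual arithmetic >> on signed values, as on mainstream compilers):

#include <cassert>
#include <cstdint>

// The scalar meaning of MULHS: high 32 bits of the 64-bit signed product.
static int32_t mulhs32(int32_t A, int32_t B) {
  return (int32_t)(((int64_t)A * (int64_t)B) >> 32);
}

int main() {
  assert(mulhs32(123456789, 0) == 0); // fold (mulhs x, 0) -> 0
  assert(mulhs32(-1, 1) == -1);       // fold (mulhs x, 1) -> sra x, 31
  assert((-1 >> 31) == -1);           // ...and sra x, 31 agrees
}
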
@@ -3043,6 +3156,14 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
+ if (VT.isVector()) {
+ // fold (mulhu x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N1;
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N0;
+ }
+
// fold (mulhu x, 0) -> 0
if (isNullConstant(N1))
return N1;
@@ -3216,7 +3337,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
- // fold (add c1, c2) -> c1+c2
+ // fold operation with constant operands.
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
if (N0C && N1C)
@@ -3599,22 +3720,20 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
}
bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
- EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
- bool &NarrowLoad) {
- uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
-
- if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))
+ EVT LoadResultTy, EVT &ExtVT) {
+ if (!AndC->getAPIntValue().isMask())
return false;
+ unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
- LoadedVT = LoadN->getMemoryVT();
+ EVT LoadedVT = LoadN->getMemoryVT();
if (ExtVT == LoadedVT &&
(!LegalOperations ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
// ZEXTLOAD will match without needing to change the size of the value being
// loaded.
- NarrowLoad = false;
return true;
}
@@ -3634,10 +3753,185 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
return false;
- NarrowLoad = true;
return true;
}
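
The simplified check reads the extload width straight off the mask. A plain-C++ model of the isMask/countTrailingOnes pairing used here (helper names are illustrative):

#include <cassert>
#include <cstdint>

// APInt::isMask(): the value is 2^k - 1 for some k >= 1.
static bool isMask(uint64_t V) { return V != 0 && (V & (V + 1)) == 0; }

// APInt::countTrailingOnes(): recovers k from such a mask.
static unsigned trailingOnes(uint64_t V) {
  unsigned N = 0;
  for (; V & 1; V >>= 1) ++N;
  return N;
}

int main() {
  assert(isMask(0xFF) && trailingOnes(0xFF) == 8); // and-mask 0xFF -> i8 zextload
  assert(!isMask(0xF0));                           // gaps disqualify the fold
}
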
+bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
+ EVT &ExtVT, unsigned ShAmt) {
+ // Don't transform one with multiple uses, this would require adding a new
+ // load.
+ if (!SDValue(LoadN, 0).hasOneUse())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
+ return false;
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!ExtVT.isRound())
+ return false;
+
+ // Don't change the width of a volatile load.
+ if (LoadN->isVolatile())
+ return false;
+
+ // Verify that we are actually reducing a load width here.
+ if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
+ return false;
+
+ // For the transform to be legal, the load must produce only two values
+ // (the value loaded and the chain). Don't transform a pre-increment
+ // load, for example, which produces an extra value. Otherwise the
+ // transformation is not equivalent, and the downstream logic to replace
+ // uses gets things wrong.
+ if (LoadN->getNumValues() > 2)
+ return false;
+
+ // If the load that we're shrinking is an extload and we're not just
+ // discarding the extension we can't simply shrink the load. Bail.
+ // TODO: It would be possible to merge the extensions in some cases.
+ if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
+ LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
+ return false;
+
+ // It's not possible to generate a constant of extended or untyped type.
+ EVT PtrType = LoadN->getOperand(1).getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ return true;
+}
+
+bool DAGCombiner::SearchForAndLoads(SDNode *N,
+ SmallPtrSetImpl<LoadSDNode*> &Loads,
+ SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+ ConstantSDNode *Mask,
+ SDNode *&NodeToMask) {
+ // Recursively search for the operands, looking for loads which can be
+ // narrowed.
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ if (Op.getValueType().isVector())
+ return false;
+
+ // Some constants may need fixing up later if they are too large.
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
+ (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
+ NodesWithConsts.insert(N);
+ continue;
+ }
+
+ if (!Op.hasOneUse())
+ return false;
+
+ switch(Op.getOpcode()) {
+ case ISD::LOAD: {
+ auto *Load = cast<LoadSDNode>(Op);
+ EVT ExtVT;
+ if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
+ isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
+ // Only add this load if we can make it more narrow.
+ if (ExtVT.bitsLT(Load->getMemoryVT()))
+ Loads.insert(Load);
+ continue;
+ }
+ return false;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::AssertZext: {
+ unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ EVT VT = Op.getOpcode() == ISD::AssertZext ?
+ cast<VTSDNode>(Op.getOperand(1))->getVT() :
+ Op.getOperand(0).getValueType();
+
+ // We can accept extending nodes if the mask is wider than or equal in
+ // width to the original type.
+ if (ExtVT.bitsGE(VT))
+ continue;
+ break;
+ }
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::AND:
+ if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
+ NodeToMask))
+ return false;
+ continue;
+ }
+
+ // Allow one node which will be masked along with any loads found.
+ if (NodeToMask)
+ return false;
+ NodeToMask = Op.getNode();
+ }
+ return true;
+}
+
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
+ auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Mask)
+ return false;
+
+ if (!Mask->getAPIntValue().isMask())
+ return false;
+
+ // No need to do anything if the and directly uses a load.
+ if (isa<LoadSDNode>(N->getOperand(0)))
+ return false;
+
+ SmallPtrSet<LoadSDNode*, 8> Loads;
+ SmallPtrSet<SDNode*, 2> NodesWithConsts;
+ SDNode *FixupNode = nullptr;
+ if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
+ if (Loads.size() == 0)
+ return false;
+
+ SDValue MaskOp = N->getOperand(1);
+
+ // If it exists, fixup the single node we allow in the tree that needs
+ // masking.
+ if (FixupNode) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
+ FixupNode->getValueType(0),
+ SDValue(FixupNode, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0),
+ MaskOp);
+ }
+
+ // Narrow any constants that need it.
+ for (auto *LogicN : NodesWithConsts) {
+ auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
+ SDValue(C, 0), MaskOp);
+ DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
+ }
+
+ // Create narrow loads.
+ for (auto *Load : Loads) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
+ SDValue(Load, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp);
+ SDValue NewLoad = ReduceLoadWidth(And.getNode());
+ assert(NewLoad &&
+ "Shouldn't be masking the load if it can't be narrowed");
+ CombineTo(Load, NewLoad, NewLoad.getValue(1));
+ }
+ DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+ return true;
+ }
+ return false;
+}
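
BackwardsPropagateMask is sound because a low-bit mask distributes over the OR/XOR nodes it walks through, so ANDing at the load leaves is equivalent to ANDing at the root. A quick plain-C++ check on arbitrary illustrative values:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xAABBCCDD, Y = 0x11223344, M = 0xFF; // M is a low-bit mask
  // and (or x, y), m == or (and x, m), (and y, m): the mask can be
  // propagated down to the leaves (the loads) and the outer AND removed.
  assert(((X | Y) & M) == ((X & M) | (Y & M)));
  assert(((X ^ Y) & M) == ((X & M) ^ (Y & M)));
}
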
+
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -3829,55 +4123,23 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
(N0.getOpcode() == ISD::ANY_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::LOAD))) {
- bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
- LoadSDNode *LN0 = HasAnyExt
- ? cast<LoadSDNode>(N0.getOperand(0))
- : cast<LoadSDNode>(N0);
- if (LN0->getExtensionType() != ISD::SEXTLOAD &&
- LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) {
- auto NarrowLoad = false;
- EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
- EVT ExtVT, LoadedVT;
- if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT,
- NarrowLoad)) {
- if (!NarrowLoad) {
- SDValue NewLoad =
- DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
- LN0->getChain(), LN0->getBasePtr(), ExtVT,
- LN0->getMemOperand());
- AddToWorklist(N);
- CombineTo(LN0, NewLoad, NewLoad.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- } else {
- EVT PtrType = LN0->getOperand(1).getValueType();
-
- unsigned Alignment = LN0->getAlignment();
- SDValue NewPtr = LN0->getBasePtr();
-
- // For big endian targets, we need to add an offset to the pointer
- // to load the correct bytes. For little endian systems, we merely
- // need to read fewer bytes from the same pointer.
- if (DAG.getDataLayout().isBigEndian()) {
- unsigned LVTStoreBytes = LoadedVT.getStoreSize();
- unsigned EVTStoreBytes = ExtVT.getStoreSize();
- unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
- SDLoc DL(LN0);
- NewPtr = DAG.getNode(ISD::ADD, DL, PtrType,
- NewPtr, DAG.getConstant(PtrOff, DL, PtrType));
- Alignment = MinAlign(Alignment, PtrOff);
- }
+ if (SDValue Res = ReduceLoadWidth(N)) {
+ LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
+ ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
- AddToWorklist(NewPtr.getNode());
+ AddToWorklist(N);
+ CombineTo(LN0, Res, Res.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
- SDValue Load = DAG.getExtLoad(
- ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr,
- LN0->getPointerInfo(), ExtVT, Alignment,
- LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
- AddToWorklist(N);
- CombineTo(LN0, Load, Load.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
- }
+ if (Level >= AfterLegalizeTypes) {
+ // Attempt to propagate the AND back up to the leaves which, if they're
+ // loads, can be combined to narrow loads and the AND node can be removed.
+ // Perform after legalization so that extend nodes will already be
+ // combined into the loads.
+ if (BackwardsPropagateMask(N, DAG)) {
+ return SDValue(N, 0);
}
}
@@ -3974,7 +4236,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
- // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00)
+ // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
bool LookPassAnd0 = false;
bool LookPassAnd1 = false;
if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
@@ -4593,20 +4855,6 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
return nullptr;
}
-// if Left + Right == Sum (constant or constant splat vector)
-static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum,
- SelectionDAG &DAG, const SDLoc &DL) {
- EVT ShiftVT = Left.getValueType();
- if (ShiftVT != Right.getValueType()) return false;
-
- SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT,
- Left.getNode(), Right.getNode());
- if (!ShiftSum) return false;
-
- ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum);
- return CSum && CSum->getZExtValue() == Sum;
-}
-
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr].
@@ -4620,6 +4868,16 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
if (!HasROTL && !HasROTR) return nullptr;
+ // Check for truncated rotate.
+ if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
+ LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
+ assert(LHS.getValueType() == RHS.getValueType());
+ if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
+ SDValue(Rot, 0)).getNode();
+ }
+ }
+
// Match "(X shl/srl V1) & V2" where V2 may not be present.
SDValue LHSShift; // The shift.
SDValue LHSMask; // AND value if any.
@@ -4652,7 +4910,11 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
- if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) {
+ auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
+ };
+ if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
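
MatchRotateSum is the first user of the new matchBinaryPredicate helper: the two shift amounts form a rotate exactly when they sum to the element width, and the lambda now also accepts matching non-splat vector amounts. The scalar identity, as a plain-C++ check:

#include <cassert>
#include <cstdint>

// rotl x, c == (x << c) | (x >> (32 - c)), assuming 0 < c < 32.
static uint32_t rotl32(uint32_t X, unsigned C) {
  return (X << C) | (X >> (32 - C));
}

int main() {
  uint32_t X = 0x12345678;
  unsigned C1 = 8, C2 = 24;                          // C1 + C2 == 32
  assert(((X << C1) | (X >> C2)) == rotl32(X, C1));  // (or (shl), (srl)) -> rotl
}
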
@@ -4712,20 +4974,22 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
}
namespace {
+
/// Represents known origin of an individual byte in load combine pattern. The
/// value of the byte is either constant zero or comes from memory.
struct ByteProvider {
// For constant zero providers Load is set to nullptr. For memory providers
// Load represents the node which loads the byte from memory.
// ByteOffset is the offset of the byte in the value produced by the load.
- LoadSDNode *Load;
- unsigned ByteOffset;
+ LoadSDNode *Load = nullptr;
+ unsigned ByteOffset = 0;
- ByteProvider() : Load(nullptr), ByteOffset(0) {}
+ ByteProvider() = default;
static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
return ByteProvider(Load, ByteOffset);
}
+
static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
bool isConstantZero() const { return !Load; }
@@ -4740,6 +5004,8 @@ private:
: Load(Load), ByteOffset(ByteOffset) {}
};
+} // end anonymous namespace
+
/// Recursively traverses the expression calculating the origin of the requested
/// byte of the given value. Returns None if the provider can't be calculated.
///
@@ -4751,9 +5017,9 @@ private:
/// Because the parts of the expression are not allowed to have more than one
/// use this function iterates over trees, not DAGs. So it never visits the same
/// node more than once.
-const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
- unsigned Depth,
- bool Root = false) {
+static const Optional<ByteProvider>
+calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
+ bool Root = false) {
// Typical i64 by i8 pattern requires recursion up to 8 calls depth
if (Depth == 10)
return None;
@@ -4837,7 +5103,6 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
return None;
}
-} // namespace
/// Match a pattern where a wide type scalar value is loaded by several narrow
/// loads and combined by shifts and ors. Fold it into a single load or a load
@@ -4950,7 +5215,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
Loads.insert(L);
}
- assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
+ assert(!Loads.empty() && "All the bytes of the value must be loaded from "
"memory, so there must be at least one load which produces the value");
assert(Base && "Base address of the accessed memory location must be set");
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
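
MatchLoadCombine targets the classic hand-written byte-assembly idiom shown in plain C++ below; the shift/or tree of narrow loads is what gets folded into one wide load (illustrative buffer contents):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint8_t Buf[4] = {0x78, 0x56, 0x34, 0x12};
  // The shift/or tree of narrow loads that MatchLoadCombine folds away:
  uint32_t Slow = (uint32_t)Buf[0] | ((uint32_t)Buf[1] << 8) |
                  ((uint32_t)Buf[2] << 16) | ((uint32_t)Buf[3] << 24);
  assert(Slow == 0x12345678);
  // On a little-endian host a single wide load produces the same value.
  uint32_t Fast;
  std::memcpy(&Fast, Buf, sizeof(Fast));
  (void)Fast; // Slow == Fast on little-endian targets
}
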
@@ -5373,7 +5638,11 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (shl x, c >= size(x)) -> undef
- if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+ // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+ return Val->getAPIntValue().uge(OpSizeInBits);
+ };
+ if (matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (shl x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -5400,20 +5669,29 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
return SDValue(N, 0);
// fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
- if (N1C && N0.getOpcode() == ISD::SHL) {
- if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
- SDLoc DL(N);
- APInt c1 = N0C1->getAPIntValue();
- APInt c2 = N1C->getAPIntValue();
+ if (N0.getOpcode() == ISD::SHL) {
+ auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).uge(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ return DAG.getConstant(0, SDLoc(N), VT);
- APInt Sum = c1 + c2;
- if (Sum.uge(OpSizeInBits))
- return DAG.getConstant(0, DL, VT);
-
- return DAG.getNode(
- ISD::SHL, DL, VT, N0.getOperand(0),
- DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+ auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).ult(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ SDLoc DL(N);
+ EVT ShiftVT = N1.getValueType();
+ SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
}
}
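
Splitting the old combined fold into MatchOutOfRange and MatchInRange predicates lets each case be matched lane-wise on vectors. The two scalar outcomes, checked in plain C++:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xFFFFFFFF;
  // In range: shl (shl x, 3), 5 == shl x, 8 because 3 + 5 < 32.
  assert(((X << 3) << 5) == (X << 8));
  // Out of range: 20 + 12 == 32, every bit is shifted out -> constant 0.
  assert(((X << 20) << 12) == 0);
}
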
@@ -5527,16 +5805,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Variant of version done on multiply, except mul by a power of 2 is turned
// into a shift.
- if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+ if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
+ N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
AddToWorklist(Shl0.getNode());
AddToWorklist(Shl1.getNode());
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
}
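
Extending this distribution from ADD to OR needs no disjoint-bits check, since shifting both sides left never makes bits collide across the OR. A plain-C++ check (the ADD form holds modulo 2^32, the OR form holds bitwise):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEAD, C1 = 0xBEEF, C2 = 5;
  // (add x, c1) << c2 == (x << c2) + (c1 << c2)  (modulo 2^32)
  assert(((X + C1) << C2) == ((X << C2) + (C1 << C2)));
  // (or x, c1) << c2 == (x << c2) | (c1 << c2)   (always)
  assert(((X | C1) << C2) == ((X << C2) | (C1 << C2)));
}
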
// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
@@ -5579,7 +5859,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
// fold (sra x, c >= size(x)) -> undef
- if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+ // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+ return Val->getAPIntValue().uge(OpSizeInBits);
+ };
+ if (matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (sra x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -5603,20 +5887,31 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
}
// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
- if (N1C && N0.getOpcode() == ISD::SRA) {
- if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
- SDLoc DL(N);
- APInt c1 = N0C1->getAPIntValue();
- APInt c2 = N1C->getAPIntValue();
- zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
-
- APInt Sum = c1 + c2;
- if (Sum.uge(OpSizeInBits))
- Sum = APInt(OpSizeInBits, OpSizeInBits - 1);
+ if (N0.getOpcode() == ISD::SRA) {
+ SDLoc DL(N);
+ EVT ShiftVT = N1.getValueType();
- return DAG.getNode(
- ISD::SRA, DL, VT, N0.getOperand(0),
- DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+ auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).uge(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
+ DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));
+
+ auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).ult(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
}
}
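
Unlike the SHL/SRL cases, an out-of-range SRA sum clamps to size-1 instead of folding to zero, because arithmetic shifts keep replicating the sign bit. A plain-C++ illustration (assumes the usual arithmetic >> for signed values, as on mainstream compilers):

#include <cassert>
#include <cstdint>

int main() {
  int32_t X = -12345;
  // In range: sra (sra x, 4), 8 == sra x, 12.
  assert(((X >> 4) >> 8) == (X >> 12));
  // Out of range: 20 + 20 >= 32 clamps to 31; only the sign bit survives.
  assert(((X >> 20) >> 20) == (X >> 31) && (X >> 31) == -1);
}
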
@@ -5647,7 +5942,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
TLI.isTruncateFree(VT, TruncVT)) {
-
SDLoc DL(N);
SDValue Amt = DAG.getConstant(ShiftAmt, DL,
getShiftAmountTy(N0.getOperand(0).getValueType()));
@@ -5697,7 +5991,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
-
// If the sign bit is known to be zero, switch this to a SRL.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
@@ -5730,7 +6023,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (srl x, c >= size(x)) -> undef
- if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+ // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
+ auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
+ return Val->getAPIntValue().uge(OpSizeInBits);
+ };
+ if (matchUnaryPredicate(N1, MatchShiftTooBig))
return DAG.getUNDEF(VT);
// fold (srl x, 0) -> x
if (N1C && N1C->isNullValue())
@@ -5745,20 +6042,29 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return DAG.getConstant(0, SDLoc(N), VT);
// fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
- if (N1C && N0.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
- SDLoc DL(N);
- APInt c1 = N0C1->getAPIntValue();
- APInt c2 = N1C->getAPIntValue();
+ if (N0.getOpcode() == ISD::SRL) {
+ auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).uge(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ return DAG.getConstant(0, SDLoc(N), VT);
- APInt Sum = c1 + c2;
- if (Sum.uge(OpSizeInBits))
- return DAG.getConstant(0, DL, VT);
-
- return DAG.getNode(
- ISD::SRL, DL, VT, N0.getOperand(0),
- DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
+ auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).ult(OpSizeInBits);
+ };
+ if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ SDLoc DL(N);
+ EVT ShiftVT = N1.getValueType();
+ SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
}
}
@@ -6008,7 +6314,6 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
return SDValue();
}
-
/// \brief Generate Min/Max node
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
@@ -6096,7 +6401,7 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// For any constants that differ by 1, we can transform the select into an
// extend and add. Use a target hook because some targets may prefer to
// transform in the other direction.
- if (TLI.convertSelectOfConstantsToMath()) {
+ if (TLI.convertSelectOfConstantsToMath(VT)) {
if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
// select Cond, C1, C1-1 --> add (zext Cond), C1-1
if (VT != MVT::i1)
@@ -6371,7 +6676,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
}
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
-
if (Level >= AfterLegalizeTypes)
return SDValue();
@@ -6432,7 +6736,6 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
}
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
-
if (Level >= AfterLegalizeTypes)
return SDValue();
@@ -6447,7 +6750,6 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
if (Mask.getOpcode() == ISD::SETCC) {
-
// Check if any splitting is required.
if (TLI.getTypeAction(*DAG.getContext(), VT) !=
TargetLowering::TypeSplitVector)
@@ -6504,11 +6806,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
}
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
-
if (Level >= AfterLegalizeTypes)
return SDValue();
- MaskedGatherSDNode *MGT = dyn_cast<MaskedGatherSDNode>(N);
+ MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
SDValue Mask = MGT->getMask();
SDLoc DL(N);
@@ -6581,7 +6882,6 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
}
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
-
if (Level >= AfterLegalizeTypes)
return SDValue();
@@ -6593,7 +6893,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
// SETCC, then split both nodes and its operands before legalization. This
// prevents the type legalizer from unrolling SETCC into scalar comparisons
// and enables future optimizations (e.g. min/max pattern matching on X86).
-
if (Mask.getOpcode() == ISD::SETCC) {
EVT VT = N->getValueType(0);
@@ -6665,6 +6964,57 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
return SDValue();
}
+/// A vector select of 2 constant vectors can be simplified to math/logic to
+/// avoid a variable select instruction and possibly avoid constant loads.
+SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
+ SDValue Cond = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
+ !TLI.convertSelectOfConstantsToMath(VT) ||
+ !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
+ !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
+ return SDValue();
+
+ // Check if we can use the condition value to increment/decrement a single
+ // constant value. This simplifies a select to an add and removes a constant
+ // load/materialization from the general case.
+ bool AllAddOne = true;
+ bool AllSubOne = true;
+ unsigned Elts = VT.getVectorNumElements();
+ for (unsigned i = 0; i != Elts; ++i) {
+ SDValue N1Elt = N1.getOperand(i);
+ SDValue N2Elt = N2.getOperand(i);
+ if (N1Elt.isUndef() || N2Elt.isUndef())
+ continue;
+
+ const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
+ const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
+ if (C1 != C2 + 1)
+ AllAddOne = false;
+ if (C1 != C2 - 1)
+ AllSubOne = false;
+ }
+
+ // Further simplifications for the extra-special cases where the constants are
+ // all 0 or all -1 should be implemented as folds of these patterns.
+ SDLoc DL(N);
+ if (AllAddOne || AllSubOne) {
+ // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
+ // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
+ auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
+ }
+
+ // The general case for select-of-constants:
+ // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
+ // ...but that only makes sense if a vselect is slower than 2 logic ops, so
+ // leave that to a machine-specific pass.
+ return SDValue();
+}
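
A scalar model of the two special cases this function recognizes, treating the i1 condition as 0/1 under zext and 0/-1 under sext (plain C++, illustrative constant):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t Cond = 0; Cond <= 1; ++Cond) {
    int32_t C = 41;
    // vselect Cond, C+1, C --> add (zext Cond), C
    assert((Cond ? C + 1 : C) == C + Cond);
    // vselect Cond, C-1, C --> add (sext Cond), C   (sext i1 is 0 or -1)
    assert((Cond ? C - 1 : C) == C + (Cond ? -1 : 0));
  }
}
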
+
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -6729,6 +7079,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
return CV;
}
+ if (SDValue V = foldVSelectOfConstants(N))
+ return V;
+
return SDValue();
}
@@ -7243,8 +7596,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDLoc(N0.getOperand(0)),
N0.getOperand(0).getValueType(), ExtLoad);
ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
+ bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
CombineTo(N, And);
+ // If N0 has multiple uses, change other uses as well.
+ if (NoReplaceTruncAnd) {
+ SDValue TruncAnd =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
+ CombineTo(N0.getNode(), TruncAnd);
+ }
if (NoReplaceTrunc)
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
else
@@ -7307,7 +7667,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;
- if (!VT.isVector()) {
+ if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
EVT SetCCVT = getSetCCResultType(N00VT);
// Don't do this transform for i1 because there's a select transform
// that would reverse it.
@@ -7399,20 +7759,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
- // fold (zext (truncate (load x))) -> (zext (smaller load x))
- // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
- if (N0.getOpcode() == ISD::TRUNCATE) {
- if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
- SDNode *oye = N0.getOperand(0).getNode();
- if (NarrowLoad.getNode() != N0.getNode()) {
- CombineTo(N0.getNode(), NarrowLoad);
- // CombineTo deleted the truncate, if needed, but not what's under it.
- AddToWorklist(oye);
- }
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
- }
-
// fold (zext (truncate x)) -> (and x, mask)
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (zext (truncate (load x))) -> (zext (smaller load x))
@@ -7445,7 +7791,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
AddToWorklist(Op.getNode());
- return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ // We may safely transfer the debug info describing the truncate node over
+ // to the equivalent and operation.
+ DAG.transferDbgValues(N0, And);
+ return And;
}
}
@@ -7522,11 +7872,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (!N0.hasOneUse()) {
if (N0.getOpcode() == ISD::AND) {
auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
- auto NarrowLoad = false;
EVT LoadResultTy = AndC->getValueType(0);
- EVT ExtVT, LoadedVT;
- if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT,
- NarrowLoad))
+ EVT ExtVT;
+ if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT))
DoXform = false;
}
if (DoXform)
@@ -7547,8 +7895,15 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDLoc(N0.getOperand(0)),
N0.getOperand(0).getValueType(), ExtLoad);
ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND);
+ bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
CombineTo(N, And);
+ // If N0 has multiple uses, change other uses as well.
+ if (NoReplaceTruncAnd) {
+ SDValue TruncAnd =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
+ CombineTo(N0.getNode(), TruncAnd);
+ }
if (NoReplaceTrunc)
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
else
@@ -7604,10 +7959,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/sign extend.
- EVT MatchingElementType = EVT::getIntegerVT(
- *DAG.getContext(), N00VT.getScalarSizeInBits());
- EVT MatchingVectorType = EVT::getVectorVT(
- *DAG.getContext(), MatchingElementType, N00VT.getVectorNumElements());
+ EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
@@ -7731,7 +8083,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
ISD::ANY_EXTEND);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = N0.hasOneUse();
- CombineTo(N, ExtLoad);
+ CombineTo(N, ExtLoad);
if (NoReplaceTrunc)
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
else
@@ -7769,13 +8121,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// aext(setcc) -> aext(vsetcc)
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations) {
- EVT N0VT = N0.getOperand(0).getValueType();
- // We know that the # elements of the results is the same as the
- // # elements of the compare (and the # elements of the compare result
- // for that matter). Check to see that they are the same size. If so,
- // we know that the element size of the sext'd result matches the
- // element size of the compare operands.
- if (VT.getSizeInBits() == N0VT.getSizeInBits())
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (getSetCCResultType(N00VT) == N0.getValueType())
+ return SDValue();
+
+ // We know that the # elements of the results is the same as the
+ // # elements of the compare (and the # elements of the compare result
+ // for that matter). Check to see that they are the same size. If so,
+ // we know that the element size of the sext'd result matches the
+ // element size of the compare operands.
+ if (VT.getSizeInBits() == N00VT.getSizeInBits())
return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
@@ -7783,7 +8138,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// elements we can use a matching integer vector type and then
// truncate/any extend
else {
- EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
+ EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
N0.getOperand(1),
@@ -7804,77 +8159,47 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return SDValue();
}
-SDValue DAGCombiner::visitAssertZext(SDNode *N) {
+SDValue DAGCombiner::visitAssertExt(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- EVT EVT = cast<VTSDNode>(N1)->getVT();
+ EVT AssertVT = cast<VTSDNode>(N1)->getVT();
- // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt)
- if (N0.getOpcode() == ISD::AssertZext &&
- EVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
+ // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
+ if (N0.getOpcode() == Opcode &&
+ AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
return N0;
- return SDValue();
-}
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == Opcode) {
+    // We have an assert, truncate, assert sandwich. Make one stronger assert
+    // by asserting the smallest asserted type on the larger source value.
+ // This eliminates the later assert:
+ // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
+ // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
+ "Asserting zero/sign-extended bits to a type larger than the "
+ "truncated destination does not provide information");
-/// See if the specified operand can be simplified with the knowledge that only
-/// the bits specified by Mask are used. If so, return the simpler operand,
-/// otherwise return a null SDValue.
-///
-/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
-/// simplify nodes with multiple uses more aggressively.)
-SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
- switch (V.getOpcode()) {
- default: break;
- case ISD::Constant: {
- const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
- assert(CV && "Const value should be ConstSDNode.");
- const APInt &CVal = CV->getAPIntValue();
- APInt NewVal = CVal & Mask;
- if (NewVal != CVal)
- return DAG.getConstant(NewVal, SDLoc(V), V.getValueType());
- break;
+ SDLoc DL(N);
+ EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
+ SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
+ SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+ BigA.getOperand(0), MinAssertVTVal);
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
}
- case ISD::OR:
- case ISD::XOR:
- // If the LHS or RHS don't contribute bits to the or, drop them.
- if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
- return V.getOperand(1);
- if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
- return V.getOperand(0);
- break;
- case ISD::SRL:
- // Only look at single-use SRLs.
- if (!V.getNode()->hasOneUse())
- break;
- if (ConstantSDNode *RHSC = getAsNonOpaqueConstant(V.getOperand(1))) {
- // See if we can recursively simplify the LHS.
- unsigned Amt = RHSC->getZExtValue();
- // Watch out for shift count overflow though.
- if (Amt >= Mask.getBitWidth()) break;
- APInt NewMask = Mask << Amt;
- if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
- return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
- SimplifyLHS, V.getOperand(1));
- }
- break;
- case ISD::AND: {
- // X & -1 -> X (ignoring bits which aren't demanded).
- ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
- if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
- return V.getOperand(0);
- break;
- }
- }
return SDValue();
}
/// If the result of a wider load is shifted right by N bits and then
/// truncated to a narrower type, where N is a multiple of the number of bits
/// of the narrower type, transform it to a narrower load from address + N / num of
-/// bits of new type. If the result is to be extended, also fold the extension
-/// to form a extending load.
+/// bits of new type. Also narrow the load if the result is masked with an AND
+/// to effectively produce a smaller type. If the result is to be extended, also
+/// fold the extension to form an extending load.
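+/// E.g. (sext_inreg (i32 (load p)), i16) may become (i32 (sextload i16 p)),
+/// assuming the extending load is legal for the target (illustrative).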
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
unsigned Opc = N->getOpcode();
@@ -7893,28 +8218,40 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
ExtType = ISD::SEXTLOAD;
ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
} else if (Opc == ISD::SRL) {
- // Another special-case: SRL is basically zero-extending a narrower value.
+ // Another special-case: SRL is basically zero-extending a narrower value,
+    // or it may be shifting a higher subword, half or byte into the lowest
+ // bits.
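+    // E.g. (srl (i32 (load p)), 16) only uses the upper half-word, so it can
+    // be re-expressed as a zero-extending i16 load of that half (subject to
+    // the legality checks below).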
ExtType = ISD::ZEXTLOAD;
N0 = SDValue(N, 0);
- ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!N01) return SDValue();
- ExtVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getSizeInBits() - N01->getZExtValue());
- }
- if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
- return SDValue();
- unsigned EVTBits = ExtVT.getSizeInBits();
+ auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
+ auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!N01 || !LN0)
+ return SDValue();
- // Do not generate loads of non-round integer types since these can
- // be expensive (and would be wrong if the type is not byte sized).
- if (!ExtVT.isRound())
- return SDValue();
+ uint64_t ShiftAmt = N01->getZExtValue();
+ uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
+ if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
+ else
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits() - ShiftAmt);
+ } else if (Opc == ISD::AND) {
+ // An AND with a constant mask is the same as a truncate + zero-extend.
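+    // E.g. (and (i32 (load p)), 255) behaves like (zext (trunc ... to i8)),
+    // so it can be matched as a zero-extending i8 load.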
+ auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!AndC || !AndC->getAPIntValue().isMask())
+ return SDValue();
+
+ unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+ ExtType = ISD::ZEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ }
unsigned ShAmt = 0;
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShAmt = N01->getZExtValue();
+ unsigned EVTBits = ExtVT.getSizeInBits();
// Is the shift amount a multiple of size of VT?
if ((ShAmt & (EVTBits-1)) == 0) {
N0 = N0.getOperand(0);
@@ -7951,42 +8288,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
}
}
- // If we haven't found a load, we can't narrow it. Don't transform one with
- // multiple uses, this would require adding a new load.
- if (!isa<LoadSDNode>(N0) || !N0.hasOneUse())
+ // If we haven't found a load, we can't narrow it.
+ if (!isa<LoadSDNode>(N0))
return SDValue();
- // Don't change the width of a volatile load.
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (LN0->isVolatile())
- return SDValue();
-
- // Verify that we are actually reducing a load width here.
- if (LN0->getMemoryVT().getSizeInBits() < EVTBits)
- return SDValue();
-
- // For the transform to be legal, the load must produce only two values
- // (the value loaded and the chain). Don't transform a pre-increment
- // load, for example, which produces an extra value. Otherwise the
- // transformation is not equivalent, and the downstream logic to replace
- // uses gets things wrong.
- if (LN0->getNumValues() > 2)
- return SDValue();
-
- // If the load that we're shrinking is an extload and we're not just
- // discarding the extension we can't simply shrink the load. Bail.
- // TODO: It would be possible to merge the extensions in some cases.
- if (LN0->getExtensionType() != ISD::NON_EXTLOAD &&
- LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
- return SDValue();
-
- if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
- return SDValue();
-
- EVT PtrType = N0.getOperand(1).getValueType();
-
- if (PtrType == MVT::Untyped || PtrType.isExtended())
- // It's not possible to generate a constant of extended or untyped type.
+ if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
// For big endian targets, we need to adjust the offset to the pointer to
@@ -7997,6 +8304,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
}
+ EVT PtrType = N0.getOperand(1).getValueType();
uint64_t PtrOff = ShAmt / 8;
unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
SDLoc DL(LN0);
@@ -8130,10 +8438,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
}
// fold (sext_inreg (extload x)) -> (sextload x)
+ // If sextload is not supported by target, we can only do the combine when
+ // load has one use. Doing otherwise can block folding the extload with other
+ // extends that the target does support.
if (ISD::isEXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
+ N0.hasOneUse()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -8208,12 +8520,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// noop truncate
if (N0.getValueType() == N->getValueType(0))
return N0;
- // fold (truncate c1) -> c1
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
+
// fold (truncate (truncate x)) -> (truncate x)
if (N0.getOpcode() == ISD::TRUNCATE)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+
+ // fold (truncate c1) -> c1
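+  // Constant folding may fail (e.g. for an opaque constant), in which case
+  // getNode CSEs back to N itself; guard against returning N and looping.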
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
+ SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
+ if (C.getNode() != N)
+ return C;
+ }
+
// fold (truncate (ext x)) -> (ext x) or (truncate x) or x
if (N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND ||
@@ -8245,7 +8563,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// we need to be more careful about the vector instructions that we generate.
if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
-
EVT VecTy = N0.getOperand(0).getValueType();
EVT ExTy = N0.getValueType();
EVT TrTy = N->getValueType(0);
@@ -8311,7 +8628,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
N0.getOperand(0).hasOneUse()) {
-
SDValue BuildVect = N0.getOperand(0);
EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
EVT TruncVecEltTy = VT.getVectorElementType();
@@ -8340,9 +8656,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// Currently we only perform this optimization on scalars because vectors
// may have different active low bits.
if (!VT.isVector()) {
- if (SDValue Shorter =
- GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
- VT.getSizeInBits())))
+ APInt Mask =
+ APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
+ if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
}
@@ -8413,7 +8729,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// Fold truncate of a bitcast of a vector to an extract of the low vector
// element.
//
- // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+ // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
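+  // where idx is 0 on little-endian targets (the low scalar bits live in
+  // element 0) and NumElts - 1 on big-endian targets.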
if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
SDValue VecSrc = N0.getOperand(0);
EVT SrcVT = VecSrc.getValueType();
@@ -8423,8 +8739,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDLoc SL(N);
EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
- VecSrc, DAG.getConstant(0, SL, IdxVT));
+ VecSrc, DAG.getConstant(Idx, SL, IdxVT));
}
}
@@ -8466,11 +8783,18 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+
+  // A BUILD_PAIR always has the least significant part in elt 0 and the
+ // most significant part in elt 1. So when combining into one large load, we
+ // need to consider the endianness.
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LD1, LD2);
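+  // E.g. on a big-endian target the most significant half (elt 1) sits at
+  // the lower address, so the swap lets the consecutive-load check below
+  // apply unchanged.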
+
if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
EVT LD1VT = LD1->getValueType(0);
- unsigned LD1Bytes = LD1VT.getSizeInBits() / 8;
+ unsigned LD1Bytes = LD1VT.getStoreSize();
if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
unsigned Align = LD1->getAlignment();
@@ -8751,12 +9075,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
if (Op.getOpcode() == ISD::BITCAST &&
Op.getOperand(0).getValueType() == VT)
return SDValue(Op.getOperand(0));
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
+ if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
return DAG.getBitcast(VT, Op);
return SDValue();
};
+ // FIXME: If either input vector is bitcast, try to convert the shuffle to
+ // the result type of this bitcast. This would eliminate at least one
+ // bitcast. See the transform in InstCombine.
SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
if (!(SV0 && SV1))
@@ -8949,7 +9276,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
- bool LookThroughFPExt = TLI.isFPExtFree(VT);
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
@@ -8979,28 +9305,31 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
}
// Look through FP_EXTEND nodes to do more combining.
- if (LookThroughFPExt) {
- // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
- if (N0.getOpcode() == ISD::FP_EXTEND) {
- SDValue N00 = N0.getOperand(0);
- if (isContractableFMUL(N00))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(1)), N1);
+
+ // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (isContractableFMUL(N00) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)), N1);
}
+ }
- // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
- // Note: Commutes FADD operands.
- if (N1.getOpcode() == ISD::FP_EXTEND) {
- SDValue N10 = N1.getOperand(0);
- if (isContractableFMUL(N10))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N10.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N10.getOperand(1)), N0);
+ // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
+ // Note: Commutes FADD operands.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (isContractableFMUL(N10) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(1)), N0);
}
}
@@ -9036,80 +9365,87 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
N0));
}
- if (LookThroughFPExt) {
- // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
- // -> (fma x, y, (fma (fpext u), (fpext v), z))
- auto FoldFAddFMAFPExtFMul = [&] (
- SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
- DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
- Z));
- };
- if (N0.getOpcode() == PreferredFusedOpcode) {
- SDValue N02 = N0.getOperand(2);
- if (N02.getOpcode() == ISD::FP_EXTEND) {
- SDValue N020 = N02.getOperand(0);
- if (isContractableFMUL(N020))
- return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
- N020.getOperand(0), N020.getOperand(1),
- N1);
+
+ // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), z))
+ auto FoldFAddFMAFPExtFMul = [&] (
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
+ Z));
+ };
+ if (N0.getOpcode() == PreferredFusedOpcode) {
+ SDValue N02 = N0.getOperand(2);
+ if (N02.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N020 = N02.getOperand(0);
+ if (isContractableFMUL(N020) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
+ return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
+ N020.getOperand(0), N020.getOperand(1),
+ N1);
}
}
+ }
- // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
- // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
- // FIXME: This turns two single-precision and one double-precision
- // operation into two double-precision operations, which might not be
- // interesting for all targets, especially GPUs.
- auto FoldFAddFPExtFMAFMul = [&] (
- SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
- DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
- DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
- Z));
- };
- if (N0.getOpcode() == ISD::FP_EXTEND) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == PreferredFusedOpcode) {
- SDValue N002 = N00.getOperand(2);
- if (isContractableFMUL(N002))
- return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
- N002.getOperand(0), N002.getOperand(1),
- N1);
+ // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ auto FoldFAddFPExtFMAFMul = [&] (
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
+ Z));
+ };
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == PreferredFusedOpcode) {
+ SDValue N002 = N00.getOperand(2);
+ if (isContractableFMUL(N002) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+ return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
+ N002.getOperand(0), N002.getOperand(1),
+ N1);
}
}
+ }
- // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
- // -> (fma y, z, (fma (fpext u), (fpext v), x))
- if (N1.getOpcode() == PreferredFusedOpcode) {
- SDValue N12 = N1.getOperand(2);
- if (N12.getOpcode() == ISD::FP_EXTEND) {
- SDValue N120 = N12.getOperand(0);
- if (isContractableFMUL(N120))
- return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
- N120.getOperand(0), N120.getOperand(1),
- N0);
+ // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
+ // -> (fma y, z, (fma (fpext u), (fpext v), x))
+ if (N1.getOpcode() == PreferredFusedOpcode) {
+ SDValue N12 = N1.getOperand(2);
+ if (N12.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N120 = N12.getOperand(0);
+ if (isContractableFMUL(N120) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
+ return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
+ N120.getOperand(0), N120.getOperand(1),
+ N0);
}
}
+ }
- // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
- // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
- // FIXME: This turns two single-precision and one double-precision
- // operation into two double-precision operations, which might not be
- // interesting for all targets, especially GPUs.
- if (N1.getOpcode() == ISD::FP_EXTEND) {
- SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == PreferredFusedOpcode) {
- SDValue N102 = N10.getOperand(2);
- if (isContractableFMUL(N102))
- return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
- N102.getOperand(0), N102.getOperand(1),
- N0);
+ // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
+ // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == PreferredFusedOpcode) {
+ SDValue N102 = N10.getOperand(2);
+ if (isContractableFMUL(N102) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
+ return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
+ N102.getOperand(0), N102.getOperand(1),
+ N0);
}
}
}
@@ -9151,7 +9487,6 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
- bool LookThroughFPExt = TLI.isFPExtFree(VT);
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
@@ -9187,79 +9522,83 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
}
// Look through FP_EXTEND nodes to do more combining.
- if (LookThroughFPExt) {
- // fold (fsub (fpext (fmul x, y)), z)
- // -> (fma (fpext x), (fpext y), (fneg z))
- if (N0.getOpcode() == ISD::FP_EXTEND) {
- SDValue N00 = N0.getOperand(0);
- if (isContractableFMUL(N00))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(1)),
- DAG.getNode(ISD::FNEG, SL, VT, N1));
+
+ // fold (fsub (fpext (fmul x, y)), z)
+ // -> (fma (fpext x), (fpext y), (fneg z))
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (isContractableFMUL(N00) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT, N1));
}
+ }
- // fold (fsub x, (fpext (fmul y, z)))
- // -> (fma (fneg (fpext y)), (fpext z), x)
- // Note: Commutes FSUB operands.
- if (N1.getOpcode() == ISD::FP_EXTEND) {
- SDValue N10 = N1.getOperand(0);
- if (isContractableFMUL(N10))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT,
+ // fold (fsub x, (fpext (fmul y, z)))
+ // -> (fma (fneg (fpext y)), (fpext z), x)
+ // Note: Commutes FSUB operands.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (isContractableFMUL(N10) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(0))),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(1)),
+ N0);
+ }
+ }
+
+  // fold (fsub (fpext (fneg (fmul x, y))), z)
+  // -> (fneg (fma (fpext x), (fpext y), z))
+  // Note: This could be removed with appropriate canonicalization of the
+  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
+  // from implementing the canonicalization in visitFSUB.
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FNEG) {
+ SDValue N000 = N00.getOperand(0);
+ if (isContractableFMUL(N000) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
+ return DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N10.getOperand(0))),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N10.getOperand(1)),
- N0);
- }
-
- // fold (fsub (fpext (fneg (fmul, x, y))), z)
- // -> (fneg (fma (fpext x), (fpext y), z))
- // Note: This could be removed with appropriate canonicalization of the
- // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
- // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
- // from implementing the canonicalization in visitFSUB.
- if (N0.getOpcode() == ISD::FP_EXTEND) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FNEG) {
- SDValue N000 = N00.getOperand(0);
- if (isContractableFMUL(N000)) {
- return DAG.getNode(ISD::FNEG, SL, VT,
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N000.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N000.getOperand(1)),
- N1));
- }
+ N000.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(1)),
+ N1));
}
}
+ }
- // fold (fsub (fneg (fpext (fmul, x, y))), z)
- // -> (fneg (fma (fpext x)), (fpext y), z)
- // Note: This could be removed with appropriate canonicalization of the
- // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
- // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
- // from implementing the canonicalization in visitFSUB.
- if (N0.getOpcode() == ISD::FNEG) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FP_EXTEND) {
- SDValue N000 = N00.getOperand(0);
- if (isContractableFMUL(N000)) {
- return DAG.getNode(ISD::FNEG, SL, VT,
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N000.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N000.getOperand(1)),
- N1));
- }
+  // fold (fsub (fneg (fpext (fmul x, y))), z)
+  // -> (fneg (fma (fpext x), (fpext y), z))
+  // Note: This could be removed with appropriate canonicalization of the
+  // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+  // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
+  // from implementing the canonicalization in visitFSUB.
+ if (N0.getOpcode() == ISD::FNEG) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N000 = N00.getOperand(0);
+ if (isContractableFMUL(N000) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) {
+ return DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(1)),
+ N1));
}
}
-
}
// More folding opportunities when target permits.
@@ -9298,102 +9637,108 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N21, N0));
}
- if (LookThroughFPExt) {
- // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
- // -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
- if (N0.getOpcode() == PreferredFusedOpcode) {
- SDValue N02 = N0.getOperand(2);
- if (N02.getOpcode() == ISD::FP_EXTEND) {
- SDValue N020 = N02.getOperand(0);
- if (isContractableFMUL(N020))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N0.getOperand(0), N0.getOperand(1),
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N020.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N020.getOperand(1)),
- DAG.getNode(ISD::FNEG, SL, VT,
- N1)));
- }
- }
- // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
- // -> (fma (fpext x), (fpext y),
- // (fma (fpext u), (fpext v), (fneg z)))
- // FIXME: This turns two single-precision and one double-precision
- // operation into two double-precision operations, which might not be
- // interesting for all targets, especially GPUs.
- if (N0.getOpcode() == ISD::FP_EXTEND) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == PreferredFusedOpcode) {
- SDValue N002 = N00.getOperand(2);
- if (isContractableFMUL(N002))
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N00.getOperand(1)),
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N002.getOperand(0)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N002.getOperand(1)),
- DAG.getNode(ISD::FNEG, SL, VT,
- N1)));
- }
- }
-
- // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
- // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
- if (N1.getOpcode() == PreferredFusedOpcode &&
- N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
- SDValue N120 = N1.getOperand(2).getOperand(0);
- if (isContractableFMUL(N120)) {
- SDValue N1200 = N120.getOperand(0);
- SDValue N1201 = N120.getOperand(1);
+ // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
+ if (N0.getOpcode() == PreferredFusedOpcode) {
+ SDValue N02 = N0.getOperand(2);
+ if (N02.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N020 = N02.getOperand(0);
+ if (isContractableFMUL(N020) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
- N1.getOperand(1),
+ N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL,
- VT, N1200)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N1201),
- N0));
+ N020.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N020.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1)));
}
}
+ }
- // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
- // -> (fma (fneg (fpext y)), (fpext z),
- // (fma (fneg (fpext u)), (fpext v), x))
- // FIXME: This turns two single-precision and one double-precision
- // operation into two double-precision operations, which might not be
- // interesting for all targets, especially GPUs.
- if (N1.getOpcode() == ISD::FP_EXTEND &&
- N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
- SDValue N100 = N1.getOperand(0).getOperand(0);
- SDValue N101 = N1.getOperand(0).getOperand(1);
- SDValue N102 = N1.getOperand(0).getOperand(2);
- if (isContractableFMUL(N102)) {
- SDValue N1020 = N102.getOperand(0);
- SDValue N1021 = N102.getOperand(1);
+ // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y),
+ // (fma (fpext u), (fpext v), (fneg z)))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == PreferredFusedOpcode) {
+ SDValue N002 = N00.getOperand(2);
+ if (isContractableFMUL(N002) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N100)),
- DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)),
DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT,
- DAG.getNode(ISD::FP_EXTEND, SL,
- VT, N1020)),
DAG.getNode(ISD::FP_EXTEND, SL, VT,
- N1021),
- N0));
+ N002.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N002.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1)));
}
}
}
+
+ // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
+ // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
+ if (N1.getOpcode() == PreferredFusedOpcode &&
+ N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
+ SDValue N120 = N1.getOperand(2).getOperand(0);
+ if (isContractableFMUL(N120) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
+ SDValue N1200 = N120.getOperand(0);
+ SDValue N1201 = N120.getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
+ N1.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL,
+ VT, N1200)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N1201),
+ N0));
+ }
+ }
+
+ // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
+ // -> (fma (fneg (fpext y)), (fpext z),
+ // (fma (fneg (fpext u)), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N1.getOpcode() == ISD::FP_EXTEND &&
+ N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
+ SDValue CvtSrc = N1.getOperand(0);
+ SDValue N100 = CvtSrc.getOperand(0);
+ SDValue N101 = CvtSrc.getOperand(1);
+ SDValue N102 = CvtSrc.getOperand(2);
+ if (isContractableFMUL(N102) &&
+ TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) {
+ SDValue N1020 = N102.getOperand(0);
+ SDValue N1021 = N102.getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N100)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL,
+ VT, N1020)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N1021),
+ N0));
+ }
+ }
}
return SDValue();
@@ -9959,6 +10304,14 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
// TODO: The FMA node should have flags that propagate to this node.
return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
}
+
+    // fma (fneg x), K, y -> fma x, -K, y
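+    // Negating K is free here when fp immediates are legal for this type, or
+    // when K has one use and is not a legal immediate (so it has to be
+    // materialized anyway).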
+ if (N0.getOpcode() == ISD::FNEG &&
+ (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
+ return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
+ DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
+ }
}
if (Options.UnsafeFPMath) {
@@ -10081,8 +10434,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
- // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
- TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
+ // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
+ TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT), Flags);
@@ -10264,7 +10617,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
- TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and SINT_TO_FP is not legal on this target,
@@ -10282,7 +10635,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
!VT.isVector() &&
(!LegalOperations ||
- TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
SDValue Ops[] =
{ N0.getOperand(0), N0.getOperand(1),
@@ -10296,7 +10649,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
if (N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() &&
(!LegalOperations ||
- TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
SDValue Ops[] =
{ N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
@@ -10318,7 +10671,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
- TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and UINT_TO_FP is not legal on this target,
@@ -10333,10 +10686,9 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
// The next optimizations are desirable only if SELECT_CC can be lowered.
if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
// fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
-
if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
(!LegalOperations ||
- TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
SDValue Ops[] =
{ N0.getOperand(0), N0.getOperand(1),
@@ -10557,6 +10909,19 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
+ // fold ftrunc (known rounded int x) -> x
+  // ftrunc is part of the fptosi/fptoui expansion on some targets, so it is
+  // likely to appear when extracting an integer from an already-rounded
+  // floating-point value.
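+  // E.g. fold ftrunc (ffloor x) -> ffloor x: ffloor already yields an
+  // integral value, so a further round toward zero is a no-op.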
+ switch (N0.getOpcode()) {
+ default: break;
+ case ISD::FRINT:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ return N0;
+ }
+
return SDValue();
}
@@ -11160,6 +11525,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// Replace the uses of Ptr with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
deleteAndRecombine(Ptr.getNode());
+ AddToWorklist(Result.getNode());
return true;
}
@@ -11445,6 +11811,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
}
namespace {
+
/// \brief Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
@@ -11462,21 +11829,19 @@ struct LoadedSlice {
struct Cost {
/// Are we optimizing for code size.
bool ForCodeSize;
+
/// Various cost.
- unsigned Loads;
- unsigned Truncates;
- unsigned CrossRegisterBanksCopies;
- unsigned ZExts;
- unsigned Shift;
+ unsigned Loads = 0;
+ unsigned Truncates = 0;
+ unsigned CrossRegisterBanksCopies = 0;
+ unsigned ZExts = 0;
+ unsigned Shift = 0;
- Cost(bool ForCodeSize = false)
- : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
- CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+ Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}
/// \brief Get the cost of one isolated slice.
Cost(const LoadedSlice &LS, bool ForCodeSize = false)
- : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
- CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+ : ForCodeSize(ForCodeSize), Loads(1) {
EVT TruncType = LS.Inst->getValueType(0);
EVT LoadedType = LS.getLoadedType();
if (TruncType != LoadedType &&
@@ -11538,13 +11903,17 @@ struct LoadedSlice {
bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
};
+
// The last instruction that represent the slice. This should be a
// truncate instruction.
SDNode *Inst;
+
// The original load instruction.
LoadSDNode *Origin;
+
// The right shift amount in bits from the original load.
unsigned Shift;
+
// The DAG from which Origin came from.
// This is used to get some contextual information about legal types, etc.
SelectionDAG *DAG;
@@ -11746,7 +12115,8 @@ struct LoadedSlice {
return true;
}
};
-}
+
+} // end anonymous namespace
/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
/// \p UsedBits looks like 0..0 1..1 0..0.
@@ -11804,7 +12174,6 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
// Set the beginning of the pair.
First = Second) {
-
Second = &LoadedSlices[CurrSlice];
// If First is NULL, it means we start a new pair.
@@ -11935,7 +12304,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
// will be across several bytes. We do not support that.
unsigned Width = User->getValueSizeInBits(0);
if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
- return 0;
+ return false;
// Build the slice for this chain of computations.
LoadedSlice LS(User, LD, Shift, &DAG);
@@ -12060,7 +12429,6 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
return Result;
}
-
/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
@@ -12121,7 +12489,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
.getNode();
}
-
/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
@@ -12325,7 +12692,6 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
// Walk all the users of the constant with which we're multiplying.
for (SDNode *Use : ConstNode->uses()) {
-
if (Use == MulNode) // This use is the one we're on right now. Skip it.
continue;
@@ -12376,6 +12742,12 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
return false;
}
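+// Peek through any number of BITCAST nodes to find the underlying value.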
+static SDValue peekThroughBitcast(SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ return V;
+}
+
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores) {
SmallVector<SDValue, 8> Chains;
@@ -12403,56 +12775,93 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
if (NumStores < 2)
return false;
- int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
-
// The latest Node in the DAG.
SDLoc DL(StoreNodes[0].MemNode);
- SDValue StoredVal;
+ int64_t ElementSizeBits = MemVT.getStoreSizeInBits();
+ unsigned SizeInBits = NumStores * ElementSizeBits;
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+
+ EVT StoreTy;
if (UseVector) {
- bool IsVec = MemVT.isVector();
- unsigned Elts = NumStores;
- if (IsVec) {
- // When merging vector stores, get the total number of elements.
- Elts *= MemVT.getVectorNumElements();
- }
+ unsigned Elts = NumStores * NumMemElts;
// Get the type for the merged vector store.
- EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
- assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+ StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ } else
+ StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+ SDValue StoredVal;
+ if (UseVector) {
if (IsConstantSrc) {
SmallVector<SDValue, 8> BuildVector;
- for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
+ for (unsigned I = 0; I != NumStores; ++I) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
SDValue Val = St->getValue();
- if (MemVT.getScalarType().isInteger())
- if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue()))
- Val = DAG.getConstant(
- (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
- SDLoc(CFP), MemVT);
+        // If the constant is of the wrong type, convert it now.
+        if (MemVT != Val.getValueType()) {
+          Val = peekThroughBitcast(Val);
+          // Deal with constants of the wrong size.
+ if (ElementSizeBits != Val.getValueSizeInBits()) {
+ EVT IntMemVT =
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (isa<ConstantFPSDNode>(Val)) {
+ // Not clear how to truncate FP values.
+ return false;
+ } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
+ Val = DAG.getConstant(C->getAPIntValue()
+ .zextOrTrunc(Val.getValueSizeInBits())
+ .zextOrTrunc(ElementSizeBits),
+ SDLoc(C), IntMemVT);
+ }
+          // Make sure the correctly sized value has the correct (memory) type.
+ Val = DAG.getBitcast(MemVT, Val);
+ }
BuildVector.push_back(Val);
}
- StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);
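+      // If the original stores were vectors, concatenate them; otherwise
+      // build one vector from the scalar elements.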
+ StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, StoreTy, BuildVector);
} else {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue Val = St->getValue();
- // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type.
- if (Val.getValueType() != MemVT)
- return false;
+ SDValue Val = peekThroughBitcast(St->getValue());
+        // All operands of BUILD_VECTOR / CONCAT_VECTORS must be of
+        // type MemVT. If the underlying value is not of the correct
+        // type, but it is an extraction from an appropriate vector,
+        // we can recast Val to the correct type. This may require
+        // converting between EXTRACT_VECTOR_ELT and
+        // EXTRACT_SUBVECTOR.
+ if ((MemVT != Val.getValueType()) &&
+ (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
+ SDValue Vec = Val.getOperand(0);
+ EVT MemVTScalarTy = MemVT.getScalarType();
+ // We may need to add a bitcast here to get types to line up.
+ if (MemVTScalarTy != Vec.getValueType()) {
+ unsigned Elts = Vec.getValueType().getSizeInBits() /
+ MemVTScalarTy.getSizeInBits();
+ EVT NewVecTy =
+ EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts);
+ Vec = DAG.getBitcast(NewVecTy, Vec);
+ }
+ auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT;
+ Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1));
+ }
Ops.push_back(Val);
}
// Build the extracted vector elements back into a vector.
- StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR,
- DL, Ty, Ops); }
+ StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, StoreTy, Ops);
+ }
} else {
// We should always use a vector store when merging extracted vector
// elements, so this path implies a store of constants.
assert(IsConstantSrc && "Merged vector elements should use vector store");
- unsigned SizeInBits = NumStores * ElementSizeBytes * 8;
APInt StoreInt(SizeInBits, 0);
// Construct a single integer constant which is made of the smaller
@@ -12463,18 +12872,25 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
SDValue Val = St->getValue();
- StoreInt <<= ElementSizeBytes * 8;
+ StoreInt <<= ElementSizeBits;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
- StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits);
+ StoreInt |= C->getAPIntValue()
+ .zextOrTrunc(ElementSizeBits)
+ .zextOrTrunc(SizeInBits);
} else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
- StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
+ StoreInt |= C->getValueAPF()
+ .bitcastToAPInt()
+ .zextOrTrunc(ElementSizeBits)
+ .zextOrTrunc(SizeInBits);
+        // If fp truncation is necessary, give up for now.
+ if (MemVT.getSizeInBits() != ElementSizeBits)
+ return false;
} else {
llvm_unreachable("Invalid constant element type");
}
}
// Create the new Load and Store operations.
- EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
}
@@ -12483,7 +12899,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
// make sure we use trunc store if it's necessary to be legal.
SDValue NewStore;
- if (UseVector || !UseTrunc) {
+ if (!UseTrunc) {
NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(),
FirstInChain->getAlignment());
@@ -12517,6 +12933,7 @@ void DAGCombiner::getStoreMergeCandidates(
BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
EVT MemVT = St->getMemoryVT();
+ SDValue Val = peekThroughBitcast(St->getValue());
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return;
@@ -12525,47 +12942,62 @@ void DAGCombiner::getStoreMergeCandidates(
if (BasePtr.getBase().isUndef())
return;
- bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
- isa<ConstantFPSDNode>(St->getValue());
- bool IsExtractVecSrc =
- (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
- bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+ bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
+ bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
+ bool IsLoadSrc = isa<LoadSDNode>(Val);
BaseIndexOffset LBasePtr;
// Match on loadbaseptr if relevant.
- if (IsLoadSrc)
- LBasePtr = BaseIndexOffset::match(
- cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG);
-
+ EVT LoadVT;
+ if (IsLoadSrc) {
+ auto *Ld = cast<LoadSDNode>(Val);
+ LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ LoadVT = Ld->getMemoryVT();
+ // Load and store should be the same type.
+ if (MemVT != LoadVT)
+ return;
+ }
auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
int64_t &Offset) -> bool {
if (Other->isVolatile() || Other->isIndexed())
return false;
- // We can merge constant floats to equivalent integers
- if (Other->getMemoryVT() != MemVT)
- if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
- isa<ConstantFPSDNode>(Other->getValue())))
- return false;
+ SDValue Val = peekThroughBitcast(Other->getValue());
+ // Allow merging constants of different types as integers.
+ bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
+ : Other->getMemoryVT() != MemVT;
if (IsLoadSrc) {
+ if (NoTypeMatch)
+ return false;
// The Load's Base Ptr must also match
- if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) {
+ if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+ if (LoadVT != OtherLd->getMemoryVT())
+ return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
return false;
} else
return false;
}
- if (IsConstantSrc)
- if (!(isa<ConstantSDNode>(Other->getValue()) ||
- isa<ConstantFPSDNode>(Other->getValue())))
+ if (IsConstantSrc) {
+ if (NoTypeMatch)
return false;
- if (IsExtractVecSrc)
- if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+ if (!(isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val)))
+ return false;
+ }
+ if (IsExtractVecSrc) {
+ // Do not merge truncated stores here.
+ if (Other->isTruncatingStore())
return false;
+ if (!MemVT.bitsEq(Val.getValueType()))
+ return false;
+ if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+ Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+ }
Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
};
+
// We are looking for a root node which is an ancestor of all mergeable
// stores. We search up through a load, to our root and then down
// through all children. For instance we will find Store{1,2,3} if
@@ -12612,10 +13044,8 @@ void DAGCombiner::getStoreMergeCandidates(
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
-
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
-
// FIXME: We should be able to truncate a full search of
// predecessors by doing a BFS and keeping tabs on the originating
// stores from which worklist nodes come in a similar way to
@@ -12648,12 +13078,13 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
return false;
EVT MemVT = St->getMemoryVT();
- int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
return false;
- bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
// This function cannot currently deal with non-byte-sized memory sizes.
@@ -12665,7 +13096,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// Perform an early exit check. Do not bother looking at stored values that
// are not constants, loads, or extracted vector elements.
- SDValue StoredVal = St->getValue();
+ SDValue StoredVal = peekThroughBitcast(St->getValue());
bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
isa<ConstantFPSDNode>(StoredVal);
@@ -12675,12 +13106,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
return false;
- // Don't merge vectors into wider vectors if the source data comes from loads.
- // TODO: This restriction can be lifted by using logic similar to the
- // ExtractVecSrc case.
- if (MemVT.isVector() && IsLoadSrc)
- return false;
-
SmallVector<MemOpLink, 8> StoreNodes;
// Find potential store merge candidates by searching through chain sub-DAG
getStoreMergeCandidates(St, StoreNodes);
@@ -12759,19 +13184,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
unsigned LastLegalVectorType = 1;
bool LastIntegerTrunc = false;
bool NonZero = false;
+ unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue StoredVal = ST->getValue();
-
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
- NonZero |= !C->isNullValue();
- } else if (ConstantFPSDNode *C =
- dyn_cast<ConstantFPSDNode>(StoredVal)) {
- NonZero |= !C->getConstantFPValue()->isNullValue();
- } else {
- // Non-constant.
- break;
+ bool IsElementZero = false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
+ IsElementZero = C->isNullValue();
+ else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
+ IsElementZero = C->getConstantFPValue()->isNullValue();
+ if (IsElementZero) {
+ if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
+ FirstZeroAfterNonZero = i;
}
+ NonZero |= !IsElementZero;
// Find a legal type for the constant store.
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
@@ -12791,8 +13217,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
IsFast) {
LastIntegerTrunc = true;
LastLegalType = i + 1;
@@ -12806,13 +13232,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
!NoVectors) {
// Find a legal type for the vector store.
- unsigned Elts = i + 1;
- if (MemVT.isVector()) {
- // When merging vector stores, get the total number of elements.
- Elts *= MemVT.getVectorNumElements();
- }
+ unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- if (TLI.isTypeLegal(Ty) &&
+ if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
FirstStoreAlign, &IsFast) &&
@@ -12821,23 +13243,34 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
}
}
+ bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
+
// Check if we found a legal type that creates a meaningful merge.
- if (LastLegalType < 2 && LastLegalVectorType < 2) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+      // beginning, one may start later in the sequence. The only
+      // reason a merge of size N could have failed where another of
+      // the same size would not have is if the alignment has
+ // improved or we've dropped a non-zero value. Drop as many
+ // candidates as we can here.
+ unsigned NumSkip = 1;
+ while (
+ (NumSkip < NumConsecutiveStores) &&
+ (NumSkip < FirstZeroAfterNonZero) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) {
+ NumSkip++;
+ }
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
continue;
}
- bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
- unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
-
bool Merged = MergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
- if (!Merged) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- continue;
- }
+ RV |= Merged;
+
// Remove merged stores for next iteration.
- RV = true;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
continue;
}
@@ -12849,25 +13282,20 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
unsigned NumStoresToMerge = 1;
- bool IsVec = MemVT.isVector();
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- unsigned StoreValOpcode = St->getValue().getOpcode();
+ SDValue StVal = peekThroughBitcast(St->getValue());
// This restriction could be loosened.
// Bail out if any stored values are not elements extracted from a
// vector. It should be possible to handle mixed sources, but load
// sources need more careful handling (see the block of code below that
// handles consecutive loads).
- if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
- StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
+ if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+ StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return RV;
// Find a legal type for the vector store.
- unsigned Elts = i + 1;
- if (IsVec) {
- // When merging vector stores, get the total number of elements.
- Elts *= MemVT.getVectorNumElements();
- }
+ unsigned Elts = (i + 1) * NumMemElts;
EVT Ty =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
bool IsFast;
@@ -12879,6 +13307,23 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
NumStoresToMerge = i + 1;
}
+ // Check if we found a legal vector type that creates a meaningful merge.
+ if (NumStoresToMerge < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning, one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have is if the alignment has
+ // improved. Drop as many candidates as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < NumConsecutiveStores) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ continue;
+ }
+
bool Merged = MergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumStoresToMerge, false, true, false);
if (!Merged) {
@@ -12905,7 +13350,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
BaseIndexOffset LdBasePtr;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+ SDValue Val = peekThroughBitcast(St->getValue());
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val);
if (!Ld)
break;
@@ -12917,10 +13363,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
if (Ld->isVolatile() || Ld->isIndexed())
break;
- // We do not accept ext loads.
- if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
- break;
-
// The stored memory type must be the same.
if (Ld->getMemoryVT() != MemVT)
break;
@@ -12986,7 +13428,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
isDereferenceable = false;
// Find a legal type for the vector store.
- EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1);
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+
bool IsFastSt, IsFastLd;
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
@@ -13023,8 +13467,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
StoreTy) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
FirstLoadAlign, &IsFastLd) &&
@@ -13047,7 +13491,19 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
NumElem = std::min(LastLegalType, NumElem);
if (NumElem < 2) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning, one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have is if the alignment of either
+ // the load or the store has improved. Drop as many candidates
+ // as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < LoadNodes.size()) &&
+ (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
continue;
}
@@ -13055,7 +13511,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// to memory.
EVT JointMemOpVT;
if (UseVectorTy) {
- JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
+ // Find a legal type for the vector store.
+ unsigned Elts = NumElem * NumMemElts;
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
} else {
unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
@@ -13104,12 +13562,17 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
SDValue(NewLoad.getNode(), 1));
}
- // Replace the all stores with the new store.
- for (unsigned i = 0; i < NumElem; ++i)
+ // Replace all of the stores with the new store. Recursively remove the
+ // corresponding value if it's no longer used.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ SDValue Val = StoreNodes[i].MemNode->getOperand(1);
CombineTo(StoreNodes[i].MemNode, NewStore);
+ if (Val.getNode()->use_empty())
+ recursivelyDeleteUnusedNodes(Val.getNode());
+ }
+
RV = true;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- continue;
}
return RV;
}
@@ -13284,7 +13747,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// See if we can simplify the input to this truncstore with knowledge that
// only the low bits are being used. For example:
// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
- SDValue Shorter = GetDemandedBits(
+ SDValue Shorter = DAG.GetDemandedBits(
Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
ST->getMemoryVT().getScalarSizeInBits()));
AddToWorklist(Value.getNode());
@@ -13356,11 +13819,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
Ptr, ST->getMemoryVT(), ST->getMemOperand());
}
- // Only perform this optimization before the types are legal, because we
- // don't want to perform this optimization on every DAGCombine invocation.
- if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG
- : !LegalTypes) {
- for (;;) {
+ // Always perform this optimization before types are legal. If the target
+ // prefers, also try this after legalization to catch stores that were created
+ // by intrinsics or other nodes.
+ if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) {
+ while (true) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
@@ -13499,6 +13962,60 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
return St1;
}
+/// Convert a disguised subvector insertion into a shuffle:
+/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+/// bitcast(shuffle (bitcast V), (extended X), Mask)
+/// Note: We do not use an insert_subvector node because that requires a legal
+/// subvector type.
+SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+ SDValue InsertVal = N->getOperand(1);
+ if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+ !InsertVal.getOperand(0).getValueType().isVector())
+ return SDValue();
+
+ SDValue SubVec = InsertVal.getOperand(0);
+ SDValue DestVec = N->getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+ EVT VT = DestVec.getValueType();
+ unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+ unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+ unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+ // Step 1: Create a shuffle mask that implements this insert operation. The
+ // vector that we are inserting into will be operand 0 of the shuffle, so
+ // those elements are just 'i'. The inserted subvector is in the first
+ // positions of operand 1 of the shuffle. Example:
+ // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+ SmallVector<int, 16> Mask(NumMaskVals);
+ for (unsigned i = 0; i != NumMaskVals; ++i) {
+ if (i / NumSrcElts == InsIndex)
+ Mask[i] = (i % NumSrcElts) + NumMaskVals;
+ else
+ Mask[i] = i;
+ }
+
+ // Bail out if the target can not handle the shuffle we want to create.
+ EVT SubVecEltVT = SubVecVT.getVectorElementType();
+ EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+ if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
+ return SDValue();
+
+ // Step 2: Create a wide vector from the inserted source vector by appending
+ // undefined elements. This is the same size as our destination vector.
+ SDLoc DL(N);
+ SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+ ConcatOps[0] = SubVec;
+ SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
+
+ // Step 3: Shuffle in the padded subvector.
+ SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+ SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+ AddToWorklist(PaddedSubV.getNode());
+ AddToWorklist(DestVecBC.getNode());
+ AddToWorklist(Shuf.getNode());
+ return DAG.getBitcast(VT, Shuf);
+}
+
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
@@ -13511,10 +14028,20 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
EVT VT = InVec.getValueType();
- // Check that we know which element is being inserted
- if (!isa<ConstantSDNode>(EltNo))
+ // Remove redundant insertions:
+ // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
+ if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
+ return InVec;
+
+ // We must know which element is being inserted for folds below here.
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+ if (!IndexC)
return SDValue();
- unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ unsigned Elt = IndexC->getZExtValue();
+
+ if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+ return Shuf;
// Canonicalize insert_vector_elt dag nodes.
// Example:
@@ -13692,9 +14219,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// converts.
}
- // extract_vector_elt (v2i32 (bitcast i64:x)), 0 -> i32 (trunc i64:x)
+ // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+ unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
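+ // On little-endian targets the low bits of x live in element 0 of the
+ // bitcast, so extracting element 0 is a plain truncate; on big-endian
+ // targets they live in the last element (e.g. element 1 of v2i32 for i64 x).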
if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
- ConstEltNo->isNullValue() && VT.isInteger()) {
+ ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
SDValue BCSrc = InVec.getOperand(0);
if (BCSrc.getValueType().isScalarInteger())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
@@ -13748,7 +14277,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// FIXME: We should handle recursing on other vector shuffles and
// scalar_to_vector here as well.
- if (!LegalOperations) {
+ if (!LegalOperations ||
+ // FIXME: Should really be just isOperationLegalOrCustom.
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
+ TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
@@ -14054,10 +14586,18 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
EVT InVT1 = VecIn1.getValueType();
EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
- unsigned Vec2Offset = InVT1.getVectorNumElements();
+ unsigned Vec2Offset = 0;
unsigned NumElems = VT.getVectorNumElements();
unsigned ShuffleNumElems = NumElems;
+ // If both input vectors are extracted from the same base vector, we do
+ // not need the extra addend (Vec2Offset) when computing the shuffle mask.
+ if (!VecIn2 || VecIn1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+     VecIn2.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+     VecIn1.getOperand(0) != VecIn2.getOperand(0))
+ Vec2Offset = InVT1.getVectorNumElements();
+
// We can't generate a shuffle node with mismatched input and output types.
// Try to make the types match the type of the output.
if (InVT1 != VT || InVT2 != VT) {
@@ -14072,7 +14612,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
VecIn2 = SDValue();
} else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
- if (!TLI.isExtractSubvectorCheap(VT, NumElems))
+ if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
return SDValue();
if (!VecIn2.getNode()) {
@@ -14204,7 +14744,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
-
SDValue ExtractedFromVec = Op.getOperand(0);
// All inputs must have the same element type as the output.
@@ -14227,6 +14766,50 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
if (VecIn.size() < 2)
return SDValue();
+ // If all the operands of the BUILD_VECTOR extract from the same
+ // vector, then split that vector efficiently based on the maximum
+ // vector access index, and adjust VectorMask and VecIn accordingly.
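+ // For illustration (hypothetical types): a 4-element BUILD_VECTOR whose
+ // operands all extract from one v32i8 source at indices <= 14 gives
+ // MaxIndex == 14, NearestPow2 == 16 and SplitSize == 8; the source is
+ // split into two v8i8 subvectors and each VectorMask entry is remapped
+ // to input 1 or 2 depending on whether its index is below SplitSize.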
+ if (VecIn.size() == 2) {
+ unsigned MaxIndex = 0;
+ unsigned NearestPow2 = 0;
+ SDValue Vec = VecIn.back();
+ EVT InVT = Vec.getValueType();
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SmallVector<unsigned, 8> IndexVec(NumElems, 0);
+
+ for (unsigned i = 0; i < NumElems; i++) {
+ if (VectorMask[i] <= 0)
+ continue;
+ unsigned Index = N->getOperand(i).getConstantOperandVal(1);
+ IndexVec[i] = Index;
+ MaxIndex = std::max(MaxIndex, Index);
+ }
+
+ NearestPow2 = PowerOf2Ceil(MaxIndex);
+ if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
+ NumElems * 2 < NearestPow2) {
+ unsigned SplitSize = NearestPow2 / 2;
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(), SplitSize);
+ if (TLI.isTypeLegal(SplitVT)) {
+ SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+ DAG.getConstant(SplitSize, DL, IdxTy));
+ SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+ DAG.getConstant(0, DL, IdxTy));
+ VecIn.pop_back();
+ VecIn.push_back(VecIn1);
+ VecIn.push_back(VecIn2);
+
+ for (unsigned i = 0; i < NumElems; i++) {
+ if (VectorMask[i] <= 0)
+ continue;
+ VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
+ }
+ }
+ }
+ }
+
// TODO: We want to sort the vectors by descending length, so that adjacent
// pairs have similar length, and the longer vector is always first in the
// pair.
@@ -14315,77 +14898,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
}
}
-
return Shuffles[0];
}
-// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
-// operations which can be matched to a truncate.
-SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) {
- // TODO: Add support for big-endian.
- if (DAG.getDataLayout().isBigEndian())
- return SDValue();
- if (N->getNumOperands() < 2)
- return SDValue();
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
- unsigned NumElems = N->getNumOperands();
-
- if (!isTypeLegal(VT))
- return SDValue();
-
- // If the input is something other than an EXTRACT_VECTOR_ELT with a constant
- // index, bail out.
- // TODO: Allow undef elements in some cases?
- if (any_of(N->ops(), [VT](SDValue Op) {
- return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(Op.getOperand(1)) ||
- Op.getValueType() != VT.getVectorElementType();
- }))
- return SDValue();
-
- // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index
- auto GetExtractIdx = [](SDValue Extract) {
- return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue();
- };
-
- // The first BUILD_VECTOR operand must be an an extract from index zero
- // (assuming no undef and little-endian).
- if (GetExtractIdx(N->getOperand(0)) != 0)
- return SDValue();
-
- // Compute the stride from the first index.
- int Stride = GetExtractIdx(N->getOperand(1));
- SDValue ExtractedFromVec = N->getOperand(0).getOperand(0);
-
- // Proceed only if the stride and the types can be matched to a truncate.
- if ((Stride == 1 || !isPowerOf2_32(Stride)) ||
- (ExtractedFromVec.getValueType().getVectorNumElements() !=
- Stride * NumElems) ||
- (VT.getScalarSizeInBits() * Stride > 64))
- return SDValue();
-
- // Check remaining operands are consistent with the computed stride.
- for (unsigned i = 1; i != NumElems; ++i) {
- SDValue Op = N->getOperand(i);
-
- if ((Op.getOperand(0) != ExtractedFromVec) ||
- (GetExtractIdx(Op) != Stride * i))
- return SDValue();
- }
-
- // All checks were ok, construct the truncate.
- LLVMContext &Ctx = *DAG.getContext();
- EVT NewVT = VT.getVectorVT(
- Ctx, EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits() * Stride), NumElems);
- EVT TruncVT =
- VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
-
- SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec);
- Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res);
- return DAG.getBitcast(VT, Res);
-}
-
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
@@ -14428,10 +14943,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
return V;
- if (TLI.isDesirableToCombineBuildVectorToTruncate())
- if (SDValue V = reduceBuildVecToTrunc(N))
- return V;
-
if (SDValue V = reduceBuildVecToShuffle(N))
return V;
@@ -14514,8 +15025,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
for (SDValue Op : N->ops()) {
// Peek through any bitcast.
- while (Op.getOpcode() == ISD::BITCAST)
- Op = Op.getOperand(0);
+ Op = peekThroughBitcast(Op);
// UNDEF nodes convert to UNDEF shuffle mask values.
if (Op.isUndef()) {
@@ -14534,8 +15044,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
EVT ExtVT = ExtVec.getValueType();
// Peek through any bitcast.
- while (ExtVec.getOpcode() == ISD::BITCAST)
- ExtVec = ExtVec.getOperand(0);
+ ExtVec = peekThroughBitcast(ExtVec);
// UNDEF nodes convert to UNDEF shuffle mask values.
if (ExtVec.isUndef()) {
@@ -14760,9 +15269,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// We are looking for an optionally bitcasted wide vector binary operator
// feeding an extract subvector.
- SDValue BinOp = Extract->getOperand(0);
- if (BinOp.getOpcode() == ISD::BITCAST)
- BinOp = BinOp.getOperand(0);
+ SDValue BinOp = peekThroughBitcast(Extract->getOperand(0));
// TODO: The motivating case for this transform is an x86 AVX1 target. That
// target has temptingly almost legal versions of bitwise logic ops in 256-bit
@@ -14786,13 +15293,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
return SDValue();
// Peek through bitcasts of the binary operator operands if needed.
- SDValue LHS = BinOp.getOperand(0);
- if (LHS.getOpcode() == ISD::BITCAST)
- LHS = LHS.getOperand(0);
-
- SDValue RHS = BinOp.getOperand(1);
- if (RHS.getOpcode() == ISD::BITCAST)
- RHS = RHS.getOperand(0);
+ SDValue LHS = peekThroughBitcast(BinOp.getOperand(0));
+ SDValue RHS = peekThroughBitcast(BinOp.getOperand(1));
// We need at least one concatenation operation of a binop operand to make
// this transform worthwhile. The concat must double the input vector sizes.
@@ -14891,8 +15393,34 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
}
// Skip bitcasting
- if (V->getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
+ V = peekThroughBitcast(V);
+
+ // If the input is a build vector, try to make a smaller build vector.
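+ // e.g. extracting the upper v2i32 half (index 2) of a v4i32 BUILD_VECTOR
+ // yields a v2i32 BUILD_VECTOR of operands 2 and 3.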
+ if (V->getOpcode() == ISD::BUILD_VECTOR) {
+ if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ EVT InVT = V->getValueType(0);
+ unsigned ExtractSize = NVT.getSizeInBits();
+ unsigned EltSize = InVT.getScalarSizeInBits();
+ // Only do this if we won't split any elements.
+ if (ExtractSize % EltSize == 0) {
+ unsigned NumElems = ExtractSize / EltSize;
+ EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(), NumElems);
+ if ((!LegalOperations ||
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) &&
+ (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
+ unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
+ EltSize;
+
+ // Extract the pieces from the original build_vector.
+ SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
+ makeArrayRef(V->op_begin() + IdxVal,
+ NumElems));
+ return DAG.getBitcast(NVT, BuildVec);
+ }
+ }
+ }
+ }
if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
// Handle only simple case where vector being inserted and vector
@@ -15013,6 +15541,37 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
}
+static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
+ SDValue N1, SelectionDAG &DAG) {
+ auto isUndefElt = [](SDValue V, int Idx) {
+ // TODO - handle more cases as required.
+ if (V.getOpcode() == ISD::BUILD_VECTOR)
+ return V.getOperand(Idx).isUndef();
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return (Idx != 0) || V.getOperand(0).isUndef();
+ return false;
+ };
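+ // e.g. for a shuffle of two v4i32 inputs with mask <0, 4, 2, 6>, if
+ // operand 2 of the BUILD_VECTOR N0 is undef, mask element 2 references an
+ // undef value and is replaced by -1, giving mask <0, 4, -1, 6>.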
+
+ EVT VT = SVN->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ bool Changed = false;
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) ||
+     ((int)NumElts <= Idx && isUndefElt(N1, Idx - NumElts))) {
+ Changed = true;
+ Idx = -1;
+ }
+ NewMask.push_back(Idx);
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask);
+
+ return SDValue();
+}
+
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
@@ -15091,7 +15650,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
//
// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
// We don't fold shuffles where one side is a non-zero constant, and we don't
-// fold shuffles if the resulting BUILD_VECTOR would have duplicate
+// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
// non-constant operands. This seems to work out reasonably well in practice.
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
@@ -15103,6 +15662,7 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
if (!N0->hasOneUse() || !N1->hasOneUse())
return SDValue();
+
// If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
// discussed above.
if (!N1.isUndef()) {
@@ -15114,6 +15674,15 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
return SDValue();
}
+ // If both inputs are splats of the same value then we can safely merge this
+ // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
+ bool IsSplat = false;
+ auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+ if (BV0 && BV1)
+ if (SDValue Splat0 = BV0->getSplatValue())
+ IsSplat = (Splat0 == BV1->getSplatValue());
+
SmallVector<SDValue, 8> Ops;
SmallSet<SDValue, 16> DuplicateOps;
for (int M : SVN->getMask()) {
@@ -15124,23 +15693,25 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
if (S.getOpcode() == ISD::BUILD_VECTOR) {
Op = S.getOperand(Idx);
} else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- if (Idx == 0)
- Op = S.getOperand(0);
+ assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index.");
+ Op = S.getOperand(0);
} else {
// Operand can't be combined - bail out.
return SDValue();
}
}
- // Don't duplicate a non-constant BUILD_VECTOR operand; semantically, this is
- // fine, but it's likely to generate low-quality code if the target can't
- // reconstruct an appropriate shuffle.
+ // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
+ // generating a splat; semantically, this is fine, but it's likely to
+ // generate low-quality code if the target can't reconstruct an appropriate
+ // shuffle.
if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
- if (!DuplicateOps.insert(Op).second)
+ if (!IsSplat && !DuplicateOps.insert(Op).second)
return SDValue();
Ops.push_back(Op);
}
+
// BUILD_VECTOR requires all inputs to be of the same type, find the
// maximum type and extend them all.
EVT SVT = VT.getScalarType();
@@ -15162,7 +15733,8 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI,
- bool LegalOperations) {
+ bool LegalOperations,
+ bool LegalTypes) {
EVT VT = SVN->getValueType(0);
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
@@ -15190,14 +15762,18 @@ static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
// Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
// power-of-2 extensions as they are the most likely.
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+ // Skip scales that don't evenly divide the element count (non-power-of-2
+ // vector sizes).
+ if (NumElts % Scale != 0)
+ continue;
if (!isAnyExtend(Scale))
continue;
EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
- if (!LegalOperations ||
- TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
- return DAG.getBitcast(VT,
+ if (!LegalTypes || TLI.isTypeLegal(OutVT))
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+ return DAG.getBitcast(VT,
DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
}
@@ -15218,9 +15794,7 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
if (!VT.isInteger() || IsBigEndian)
return SDValue();
- SDValue N0 = SVN->getOperand(0);
- while (N0.getOpcode() == ISD::BITCAST)
- N0 = N0.getOperand(0);
+ SDValue N0 = peekThroughBitcast(SVN->getOperand(0));
unsigned Opcode = N0.getOpcode();
if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
@@ -15316,6 +15890,84 @@ static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
NewMask);
}
+/// If the shuffle mask is taking exactly one element from the first vector
+/// operand and passing through all other elements from the second vector
+/// operand, return the index of the mask element that is choosing an element
+/// from the first operand. Otherwise, return -1.
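+/// Example: Mask = {0, 5, 6, 7} (MaskSize == 4) returns 0, since only mask
+/// element 0 selects from operand 0 and the rest pass through operand 1;
+/// Mask = {0, 1, 6, 7} returns -1 because two elements come from operand 0.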
+static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
+ int MaskSize = Mask.size();
+ int EltFromOp0 = -1;
+ // TODO: This does not match if there are undef elements in the shuffle mask.
+ // Should we ignore undefs in the shuffle mask instead? The trade-off is
+ // removing an instruction (a shuffle), but losing the knowledge that some
+ // vector lanes are not needed.
+ for (int i = 0; i != MaskSize; ++i) {
+ if (Mask[i] >= 0 && Mask[i] < MaskSize) {
+ // We're looking for a shuffle of exactly one element from operand 0.
+ if (EltFromOp0 != -1)
+ return -1;
+ EltFromOp0 = i;
+ } else if (Mask[i] != i + MaskSize) {
+ // Nothing from operand 1 can change lanes.
+ return -1;
+ }
+ }
+ return EltFromOp0;
+}
+
+/// If a shuffle inserts exactly one element from a source vector operand into
+/// another vector operand and we can access the specified element as a scalar,
+/// then we can eliminate the shuffle.
+static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
+ SelectionDAG &DAG) {
+ // First, check if we are taking one element of a vector and shuffling that
+ // element into another vector.
+ ArrayRef<int> Mask = Shuf->getMask();
+ SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
+ SDValue Op0 = Shuf->getOperand(0);
+ SDValue Op1 = Shuf->getOperand(1);
+ int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
+ if (ShufOp0Index == -1) {
+ // Commute mask and check again.
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
+ if (ShufOp0Index == -1)
+ return SDValue();
+ // Commute operands to match the commuted shuffle mask.
+ std::swap(Op0, Op1);
+ Mask = CommutedMask;
+ }
+
+ // The shuffle inserts exactly one element from operand 0 into operand 1.
+ // Now see if we can access that element as a scalar via a real insert element
+ // instruction.
+ // TODO: We can try harder to locate the element as a scalar. Examples: it
+ // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
+ assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
+ "Shuffle mask value must be from operand 0");
+ if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
+ if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
+ return SDValue();
+
+ // There's an existing insertelement with constant insertion index, so we
+ // don't need to check the legality/profitability of a replacement operation
+ // that differs at most in the constant value. The target should be able to
+ // lower any of those in a similar way. If not, legalization will expand this
+ // to a scalar-to-vector plus shuffle.
+ //
+ // Note that the shuffle may move the scalar from the position that the insert
+ // element used. Therefore, our new insert element occurs at the shuffle's
+ // mask index value, not the insert's index value.
+ // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+ SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
+ Op0.getOperand(2).getValueType());
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
+ Op1, Op0.getOperand(1), NewInsIndex);
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -15362,6 +16014,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
+ // Simplify shuffle mask if a referenced element is UNDEF.
+ if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
+ return V;
+
+ if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
+ return InsElt;
+
// A shuffle of a single vector that is a splat can always be folded.
if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
if (N1->isUndef() && N0Shuf->isSplat())
@@ -15426,7 +16085,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return S;
// Match shuffles that can be converted to any_vector_extend_in_reg.
- if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
+ if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations,
+                                              LegalTypes))
return V;
// Combine "truncate_vector_in_reg" style shuffles.
@@ -15486,7 +16145,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (TLI.isTypeLegal(ScaleVT) &&
0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
-
int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
@@ -15661,23 +16319,46 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
// Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
- // with a VECTOR_SHUFFLE.
+ // with a VECTOR_SHUFFLE and possible truncate.
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue InVec = InVal->getOperand(0);
SDValue EltNo = InVal->getOperand(1);
-
- // FIXME: We could support implicit truncation if the shuffle can be
- // scaled to a smaller vector scalar type.
- ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo);
- if (C0 && VT == InVec.getValueType() &&
- VT.getScalarType() == InVal.getValueType()) {
- SmallVector<int, 8> NewMask(VT.getVectorNumElements(), -1);
+ auto InVecT = InVec.getValueType();
+ if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
+ SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
int Elt = C0->getZExtValue();
NewMask[0] = Elt;
-
- if (TLI.isShuffleMaskLegal(NewMask, VT))
- return DAG.getVectorShuffle(VT, SDLoc(N), InVec, DAG.getUNDEF(VT),
- NewMask);
+ SDValue Val;
+ // If we have an implicit truncate, do the truncate here as long as it's
+ // legal; if it's not legal, this combine is simply skipped.
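+ // e.g. (v4i32 scalar_to_vector (i64 extract_vector_elt V, 0)) becomes
+ // (v4i32 scalar_to_vector (i32 trunc (i64 extract_vector_elt V, 0))),
+ // assuming i32 is a legal scalar type.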
+ if (VT.getScalarType() != InVal.getValueType() &&
+ InVal.getValueType().isScalarInteger() &&
+ isTypeLegal(VT.getScalarType())) {
+ Val =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
+ }
+ if (VT.getScalarType() == InVecT.getScalarType() &&
+ VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
+ TLI.isShuffleMaskLegal(NewMask, VT)) {
+ Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
+ DAG.getUNDEF(InVecT), NewMask);
+ // If the initial vector is the correct size, this shuffle is a
+ // valid result.
+ if (VT == InVecT)
+ return Val;
+ // If not, we must truncate the vector.
+ if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
+ EVT SubVT =
+ EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
+ VT.getVectorNumElements());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
+ ZeroIdx);
+ return Val;
+ }
+ }
}
}
@@ -15694,12 +16375,47 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
if (N1.isUndef())
return N0;
+ // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow
+ // us to pull BITCASTs from input to output.
+ if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
+ if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);
+
// If this is an insert of an extracted vector into an undef vector, we can
// just use the input to the extract.
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
return N1.getOperand(0);
+ // If we are inserting a bitcast value into an undef, with the same
+ // number of elements, just use the bitcast input of the extract.
+ // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
+ // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
+ if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
+ N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOperand(0).getOperand(1) == N2 &&
+ N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements()) {
+ return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
+ }
+
+ // If both N0 and N1 are bitcast values on which insert_subvector
+ // makes sense, pull the bitcast through.
+ // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
+ // BITCAST (INSERT_SUBVECTOR N0 N1 N2)
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+ SDValue CN0 = N0.getOperand(0);
+ SDValue CN1 = N1.getOperand(0);
+ if (CN0.getValueType().getVectorElementType() ==
+ CN1.getValueType().getVectorElementType() &&
+ CN0.getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements()) {
+ SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
+ CN0.getValueType(), CN0, CN1, N2);
+ return DAG.getBitcast(VT, NewINSERT);
+ }
+ }
+
// Combine INSERT_SUBVECTORs where we are inserting to the same index.
// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -15779,7 +16495,7 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
+ SDValue RHS = peekThroughBitcast(N->getOperand(1));
SDLoc DL(N);
// Make sure we're not running after operation legalization where it
@@ -15790,9 +16506,6 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
if (N->getOpcode() != ISD::AND)
return SDValue();
- if (RHS.getOpcode() == ISD::BITCAST)
- RHS = RHS.getOperand(0);
-
if (RHS.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
@@ -15945,7 +16658,6 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
/// the DAG combiner loop to avoid it being looked at.
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
SDValue RHS) {
-
// fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
// The select + setcc is redundant, because fsqrt returns NaN for X < 0.
if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
@@ -16418,7 +17130,7 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
SDValue DAGCombiner::BuildSDIV(SDNode *N) {
// when optimising for minimum size, we don't want to expand a div to a mul
// and a shift.
- if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
@@ -16429,7 +17141,7 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) {
if (C->isNullValue())
return SDValue();
- std::vector<SDNode*> Built;
+ std::vector<SDNode *> Built;
SDValue S =
TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
@@ -16464,7 +17176,7 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
// when optimising for minimum size, we don't want to expand a div to a mul
// and a shift.
- if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
@@ -16475,7 +17187,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
if (C->isNullValue())
return SDValue();
- std::vector<SDNode*> Built;
+ std::vector<SDNode *> Built;
SDValue S =
TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
@@ -16760,8 +17472,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
if (Op1->isInvariant() && Op0->writeMem())
return false;
- unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3;
- unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3;
+ unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize();
+ unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();
// Check for BaseIndexOffset matching.
BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
@@ -16957,7 +17669,11 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
/// (aliasing node.)
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
- SmallVector<SDValue, 8> Aliases; // Ops for replacing token factor.
+ if (OptLevel == CodeGenOpt::None)
+ return OldChain;
+
+ // Ops for replacing token factor.
+ SmallVector<SDValue, 8> Aliases;
// Accumulate all the aliases to this node.
GatherAllAliases(N, OldChain, Aliases);
@@ -16987,6 +17703,9 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
// to go from a partially-merged state to the desired final
// fully-merged state.
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);