| | | |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 |
| commit | cfca06d7963fa0909f90483b42a6d7d194d01e08 | |
| tree | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | |
| parent | 706b4fc47bbc608932d3b491ae19a3b9cde9497b | |
Diffstat (limited to 'llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3368 |

1 file changed, 2132 insertions, 1236 deletions
| diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e5bc08b9280a..f14b3dba4f31 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -30,6 +30,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/VectorUtils.h"  #include "llvm/CodeGen/DAGCombine.h"  #include "llvm/CodeGen/ISDOpcodes.h"  #include "llvm/CodeGen/MachineFrameInfo.h" @@ -124,17 +125,29 @@ static cl::opt<unsigned> StoreMergeDependenceLimit(      cl::desc("Limit the number of times for the same StoreNode and RootNode "               "to bail out in store merging dependence check")); +static cl::opt<bool> EnableReduceLoadOpStoreWidth( +    "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true), +    cl::desc("DAG cominber enable reducing the width of load/op/store " +             "sequence")); + +static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore( +    "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true), +    cl::desc("DAG cominber enable load/<replace bytes>/store with " +             "a narrower store")); +  namespace {    class DAGCombiner {      SelectionDAG &DAG;      const TargetLowering &TLI; +    const SelectionDAGTargetInfo *STI;      CombineLevel Level;      CodeGenOpt::Level OptLevel;      bool LegalDAG = false;      bool LegalOperations = false;      bool LegalTypes = false;      bool ForCodeSize; +    bool DisableGenericCombines;      /// Worklist of all of the nodes that need to be simplified.      /// @@ -222,9 +235,11 @@ namespace {    public:      DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) -        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), -          OptLevel(OL), AA(AA) { +        : DAG(D), TLI(D.getTargetLoweringInfo()), +          STI(D.getSubtarget().getSelectionDAGInfo()), +          Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) {        ForCodeSize = DAG.shouldOptForSize(); +      DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);        MaximumLegalStoreInBits = 0;        // We use the minimum store size here, since that's all we can guarantee @@ -307,23 +322,34 @@ namespace {      }      bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) { -      EVT VT = Op.getValueType(); -      unsigned NumElts = VT.isVector() ? VT.getVectorNumElements() : 1; -      APInt DemandedElts = APInt::getAllOnesValue(NumElts); -      return SimplifyDemandedBits(Op, DemandedBits, DemandedElts); +      TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); +      KnownBits Known; +      if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false)) +        return false; + +      // Revisit the node. +      AddToWorklist(Op.getNode()); + +      CommitTargetLoweringOpt(TLO); +      return true;      }      /// Check the specified vector node value to see if it can be simplified or      /// if things it uses can be simplified as it only uses some of the      /// elements. If so, return true.      bool SimplifyDemandedVectorElts(SDValue Op) { +      // TODO: For now just pretend it cannot be simplified. 
+      if (Op.getValueType().isScalableVector()) +        return false; +        unsigned NumElts = Op.getValueType().getVectorNumElements();        APInt DemandedElts = APInt::getAllOnesValue(NumElts);        return SimplifyDemandedVectorElts(Op, DemandedElts);      }      bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, -                              const APInt &DemandedElts); +                              const APInt &DemandedElts, +                              bool AssumeSingleUse = false);      bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,                                      bool AssumeSingleUse = false); @@ -429,11 +455,13 @@ namespace {      SDValue visitZERO_EXTEND(SDNode *N);      SDValue visitANY_EXTEND(SDNode *N);      SDValue visitAssertExt(SDNode *N); +    SDValue visitAssertAlign(SDNode *N);      SDValue visitSIGN_EXTEND_INREG(SDNode *N);      SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);      SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);      SDValue visitTRUNCATE(SDNode *N);      SDValue visitBITCAST(SDNode *N); +    SDValue visitFREEZE(SDNode *N);      SDValue visitBUILD_PAIR(SDNode *N);      SDValue visitFADD(SDNode *N);      SDValue visitFSUB(SDNode *N); @@ -522,9 +550,8 @@ namespace {      SDValue rebuildSetCC(SDValue N);      bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, -                           SDValue &CC) const; +                           SDValue &CC, bool MatchStrict = false) const;      bool isOneUseSetCC(SDValue N) const; -    bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y);      SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,                                           unsigned HiOp); @@ -553,6 +580,10 @@ namespace {                                SDValue InnerPos, SDValue InnerNeg,                                unsigned PosOpcode, unsigned NegOpcode,                                const SDLoc &DL); +    SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, +                              SDValue InnerPos, SDValue InnerNeg, +                              unsigned PosOpcode, unsigned NegOpcode, +                              const SDLoc &DL);      SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);      SDValue MatchLoadCombine(SDNode *N);      SDValue MatchStoreCombine(StoreSDNode *N); @@ -562,6 +593,7 @@ namespace {      SDValue TransformFPLoadStorePair(SDNode *N);      SDValue convertBuildVecZextToZext(SDNode *N);      SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); +    SDValue reduceBuildVecTruncToBitCast(SDNode *N);      SDValue reduceBuildVecToShuffle(SDNode *N);      SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,                                    ArrayRef<int> VectorMask, SDValue VecIn1, @@ -606,6 +638,19 @@ namespace {            : MemNode(N), OffsetFromBase(Offset) {}      }; +    // Classify the origin of a stored value. 
+    enum class StoreSource { Unknown, Constant, Extract, Load }; +    StoreSource getStoreSource(SDValue StoreVal) { +      if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal)) +        return StoreSource::Constant; +      if (StoreVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || +          StoreVal.getOpcode() == ISD::EXTRACT_SUBVECTOR) +        return StoreSource::Extract; +      if (isa<LoadSDNode>(StoreVal)) +        return StoreSource::Load; +      return StoreSource::Unknown; +    } +      /// This is a helper function for visitMUL to check the profitability      /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).      /// MulNode is the original multiply, AddNode is (add x, c1), @@ -633,43 +678,66 @@ namespace {      /// can be combined into narrow loads.      bool BackwardsPropagateMask(SDNode *N); -    /// Helper function for MergeConsecutiveStores which merges the -    /// component store chains. +    /// Helper function for mergeConsecutiveStores which merges the component +    /// store chains.      SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,                                  unsigned NumStores); -    /// This is a helper function for MergeConsecutiveStores. When the -    /// source elements of the consecutive stores are all constants or -    /// all extracted vector elements, try to merge them into one -    /// larger store introducing bitcasts if necessary.  \return True -    /// if a merged store was created. -    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, +    /// This is a helper function for mergeConsecutiveStores. When the source +    /// elements of the consecutive stores are all constants or all extracted +    /// vector elements, try to merge them into one larger store introducing +    /// bitcasts if necessary.  \return True if a merged store was created. +    bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,                                           EVT MemVT, unsigned NumStores,                                           bool IsConstantSrc, bool UseVector,                                           bool UseTrunc); -    /// This is a helper function for MergeConsecutiveStores. Stores -    /// that potentially may be merged with St are placed in -    /// StoreNodes. RootNode is a chain predecessor to all store -    /// candidates. +    /// This is a helper function for mergeConsecutiveStores. Stores that +    /// potentially may be merged with St are placed in StoreNodes. RootNode is +    /// a chain predecessor to all store candidates.      void getStoreMergeCandidates(StoreSDNode *St,                                   SmallVectorImpl<MemOpLink> &StoreNodes,                                   SDNode *&Root); -    /// Helper function for MergeConsecutiveStores. Checks if -    /// candidate stores have indirect dependency through their -    /// operands. RootNode is the predecessor to all stores calculated -    /// by getStoreMergeCandidates and is used to prune the dependency check. -    /// \return True if safe to merge. +    /// Helper function for mergeConsecutiveStores. Checks if candidate stores +    /// have indirect dependency through their operands. RootNode is the +    /// predecessor to all stores calculated by getStoreMergeCandidates and is +    /// used to prune the dependency check. \return True if safe to merge.      
bool checkMergeStoreCandidatesForDependencies(          SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,          SDNode *RootNode); +    /// This is a helper function for mergeConsecutiveStores. Given a list of +    /// store candidates, find the first N that are consecutive in memory. +    /// Returns 0 if there are not at least 2 consecutive stores to try merging. +    unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes, +                                  int64_t ElementSizeBytes) const; + +    /// This is a helper function for mergeConsecutiveStores. It is used for +    /// store chains that are composed entirely of constant values. +    bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes, +                                  unsigned NumConsecutiveStores, +                                  EVT MemVT, SDNode *Root, bool AllowVectors); + +    /// This is a helper function for mergeConsecutiveStores. It is used for +    /// store chains that are composed entirely of extracted vector elements. +    /// When extracting multiple vector elements, try to store them in one +    /// vector store rather than a sequence of scalar stores. +    bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes, +                                 unsigned NumConsecutiveStores, EVT MemVT, +                                 SDNode *Root); + +    /// This is a helper function for mergeConsecutiveStores. It is used for +    /// store chains that are composed entirely of loaded values. +    bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, +                              unsigned NumConsecutiveStores, EVT MemVT, +                              SDNode *Root, bool AllowVectors, +                              bool IsNonTemporalStore, bool IsNonTemporalLoad); +      /// Merge consecutive store operations into a wide store.      /// This optimization uses wide integers or vectors when possible. -    /// \return number of stores that were merged into a merged store (the -    /// affected nodes are stored as a prefix in \p StoreNodes). -    bool MergeConsecutiveStores(StoreSDNode *St); +    /// \return true if stores were merged. +    bool mergeConsecutiveStores(StoreSDNode *St);      /// Try to transform a truncation where C is a constant:      ///     (trunc (and X, C)) -> (and (trunc X), (trunc C)) @@ -814,7 +882,7 @@ static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {  // the appropriate nodes based on the type of node we are checking. This  // simplifies life a bit for the callers.  
bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, -                                    SDValue &CC) const { +                                    SDValue &CC, bool MatchStrict) const {    if (N.getOpcode() == ISD::SETCC) {      LHS = N.getOperand(0);      RHS = N.getOperand(1); @@ -822,6 +890,15 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,      return true;    } +  if (MatchStrict && +      (N.getOpcode() == ISD::STRICT_FSETCC || +       N.getOpcode() == ISD::STRICT_FSETCCS)) { +    LHS = N.getOperand(1); +    RHS = N.getOperand(2); +    CC  = N.getOperand(3); +    return true; +  } +    if (N.getOpcode() != ISD::SELECT_CC ||        !TLI.isConstTrueVal(N.getOperand(2).getNode()) ||        !TLI.isConstFalseVal(N.getOperand(3).getNode())) @@ -886,6 +963,13 @@ static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {           ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());  } +// Determine if this an indexed load with an opaque target constant index. +static bool canSplitIdx(LoadSDNode *LD) { +  return MaySplitLoadIndex && +         (LD->getOperand(2).getOpcode() != ISD::TargetConstant || +          !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque()); +} +  bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,                                                               const SDLoc &DL,                                                               SDValue N0, @@ -951,14 +1035,11 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,    if (N0.getOpcode() != Opc)      return SDValue(); -  // Don't reassociate reductions. -  if (N0->getFlags().hasVectorReduction()) -    return SDValue(); - -  if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { -    if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) { +  if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { +    if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {        // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) -      if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2)) +      if (SDValue OpNode = +              DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);        return SDValue();      } @@ -978,9 +1059,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,  SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,                                      SDValue N1, SDNodeFlags Flags) {    assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative."); -  // Don't reassociate reductions. -  if (Flags.hasVectorReduction()) -    return SDValue();    // Floating-point reassociation is not allowed without loose FP math.    if (N0.getValueType().isFloatingPoint() || @@ -1029,6 +1107,12 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,  void DAGCombiner::  CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { +  // Replace the old value with the new one. +  ++NodesCombined; +  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); +             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); +             dbgs() << '\n'); +    // Replace all uses.  If any nodes become isomorphic to other nodes and    // are deleted, make sure to remove them from our worklist.    
WorklistRemover DeadNodes(*this); @@ -1047,21 +1131,17 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {  /// Check the specified integer node value to see if it can be simplified or if  /// things it uses can be simplified by bit propagation. If so, return true.  bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, -                                       const APInt &DemandedElts) { +                                       const APInt &DemandedElts, +                                       bool AssumeSingleUse) {    TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);    KnownBits Known; -  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO)) +  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0, +                                AssumeSingleUse))      return false;    // Revisit the node.    AddToWorklist(Op.getNode()); -  // Replace the old value with the new one. -  ++NodesCombined; -  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); -             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); -             dbgs() << '\n'); -    CommitTargetLoweringOpt(TLO);    return true;  } @@ -1081,12 +1161,6 @@ bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,    // Revisit the node.    AddToWorklist(Op.getNode()); -  // Replace the old value with the new one. -  ++NodesCombined; -  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); -             dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); -             dbgs() << '\n'); -    CommitTargetLoweringOpt(TLO);    return true;  } @@ -1210,8 +1284,11 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {      SDValue RV =          DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); -    // We are always replacing N0/N1's use in N and only need -    // additional replacements if there are additional uses. +    // We are always replacing N0/N1's use in N and only need additional +    // replacements if there are additional uses. +    // Note: We are checking uses of the *nodes* (SDNode) rather than values +    //       (SDValue) here because the node may reference multiple values +    //       (for example, the chain value of a load node).      Replace0 &= !N0->hasOneUse();      Replace1 &= (N0 != N1) && !N1->hasOneUse(); @@ -1561,6 +1638,7 @@ SDValue DAGCombiner::visit(SDNode *N) {    case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);    case ISD::AssertSext:    case ISD::AssertZext:         return visitAssertExt(N); +  case ISD::AssertAlign:        return visitAssertAlign(N);    case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);    case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);    case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); @@ -1610,6 +1688,7 @@ SDValue DAGCombiner::visit(SDNode *N) {    case ISD::LIFETIME_END:       return visitLIFETIME_END(N);    case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);    case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N); +  case ISD::FREEZE:             return visitFREEZE(N);    case ISD::VECREDUCE_FADD:    case ISD::VECREDUCE_FMUL:    case ISD::VECREDUCE_ADD: @@ -1628,7 +1707,9 @@ SDValue DAGCombiner::visit(SDNode *N) {  }  SDValue DAGCombiner::combine(SDNode *N) { -  SDValue RV = visit(N); +  SDValue RV; +  if (!DisableGenericCombines) +    RV = visit(N);    // If nothing happened, try a target-specific DAG combine.    
if (!RV.getNode()) { @@ -2046,12 +2127,11 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {    // We need a constant operand for the add/sub, and the other operand is a    // logical shift right: add (srl), C or sub C, (srl). -  // TODO - support non-uniform vector amounts.    bool IsAdd = N->getOpcode() == ISD::ADD;    SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);    SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1); -  ConstantSDNode *C = isConstOrConstSplat(ConstantOp); -  if (!C || ShiftOp.getOpcode() != ISD::SRL) +  if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) || +      ShiftOp.getOpcode() != ISD::SRL)      return SDValue();    // The shift must be of a 'not' value. @@ -2072,8 +2152,11 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {    SDLoc DL(N);    auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;    SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt); -  APInt NewC = IsAdd ? C->getAPIntValue() + 1 : C->getAPIntValue() - 1; -  return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT)); +  if (SDValue NewC = +          DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT, +                                     {ConstantOp, DAG.getConstant(1, DL, VT)})) +    return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC); +  return SDValue();  }  /// Try to fold a node that behaves like an ADD (note that N isn't necessarily @@ -2109,8 +2192,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {      if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))        return DAG.getNode(ISD::ADD, DL, VT, N1, N0);      // fold (add c1, c2) -> c1+c2 -    return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(), -                                      N1.getNode()); +    return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1});    }    // fold (add x, 0) -> x @@ -2121,8 +2203,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {      // fold ((A-c1)+c2) -> (A+(c2-c1))      if (N0.getOpcode() == ISD::SUB &&          isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) { -      SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N1.getNode(), -                                               N0.getOperand(1).getNode()); +      SDValue Sub = +          DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)});        assert(Sub && "Constant folding failed");        return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);      } @@ -2130,8 +2212,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {      // fold ((c1-A)+c2) -> (c1+c2)-A      if (N0.getOpcode() == ISD::SUB &&          isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { -      SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N1.getNode(), -                                               N0.getOperand(0).getNode()); +      SDValue Add = +          DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)});        assert(Add && "Constant folding failed");        return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));      } @@ -2152,13 +2234,14 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {        }      } -    // Undo the add -> or combine to merge constant offsets from a frame index. +    // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is +    // equivalent to (add x, c0).      
if (N0.getOpcode() == ISD::OR && -        isa<FrameIndexSDNode>(N0.getOperand(0)) && -        isa<ConstantSDNode>(N0.getOperand(1)) && +        isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) &&          DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { -      SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1)); -      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); +      if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, +                                                    {N1, N0.getOperand(1)})) +        return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);      }    } @@ -2317,6 +2400,23 @@ SDValue DAGCombiner::visitADD(SDNode *N) {        DAG.haveNoCommonBitsSet(N0, N1))      return DAG.getNode(ISD::OR, DL, VT, N0, N1); +  // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). +  if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) { +    APInt C0 = N0->getConstantOperandAPInt(0); +    APInt C1 = N1->getConstantOperandAPInt(0); +    return DAG.getVScale(DL, VT, C0 + C1); +  } + +  // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2) +  if ((N0.getOpcode() == ISD::ADD) && +      (N0.getOperand(1).getOpcode() == ISD::VSCALE) && +      (N1.getOpcode() == ISD::VSCALE)) { +    auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0); +    auto VS1 = N1->getConstantOperandAPInt(0); +    auto VS = DAG.getVScale(DL, VT, VS0 + VS1); +    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS); +  } +    return SDValue();  } @@ -2347,8 +2447,7 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) {      if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))        return DAG.getNode(Opcode, DL, VT, N1, N0);      // fold (add_sat c1, c2) -> c3 -    return DAG.FoldConstantArithmetic(Opcode, DL, VT, N0.getNode(), -                                      N1.getNode()); +    return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1});    }    // fold (add_sat x, 0) -> x @@ -2968,12 +3067,10 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {    // FIXME: Refactor this and xor and other similar operations together.    
if (N0 == N1)      return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); -  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && -      DAG.isConstantIntBuildVectorOrConstantInt(N1)) { -    // fold (sub c1, c2) -> c1-c2 -    return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(), -                                      N1.getNode()); -  } + +  // fold (sub c1, c2) -> c3 +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) +    return C;    if (SDValue NewSel = foldBinOpIntoSelect(N))      return NewSel; @@ -3040,8 +3137,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {    if (N0.getOpcode() == ISD::ADD &&        isConstantOrConstantVector(N1, /* NoOpaques */ true) &&        isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { -    SDValue NewC = DAG.FoldConstantArithmetic( -        ISD::SUB, DL, VT, N0.getOperand(1).getNode(), N1.getNode()); +    SDValue NewC = +        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1});      assert(NewC && "Constant folding failed");      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);    } @@ -3051,8 +3148,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {      SDValue N11 = N1.getOperand(1);      if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&          isConstantOrConstantVector(N11, /* NoOpaques */ true)) { -      SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(), -                                                N11.getNode()); +      SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11});        assert(NewC && "Constant folding failed");        return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));      } @@ -3062,8 +3158,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {    if (N0.getOpcode() == ISD::SUB &&        isConstantOrConstantVector(N1, /* NoOpaques */ true) &&        isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { -    SDValue NewC = DAG.FoldConstantArithmetic( -        ISD::ADD, DL, VT, N0.getOperand(1).getNode(), N1.getNode()); +    SDValue NewC = +        DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1});      assert(NewC && "Constant folding failed");      return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);    } @@ -3072,8 +3168,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {    if (N0.getOpcode() == ISD::SUB &&        isConstantOrConstantVector(N1, /* NoOpaques */ true) &&        isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) { -    SDValue NewC = DAG.FoldConstantArithmetic( -        ISD::SUB, DL, VT, N0.getOperand(0).getNode(), N1.getNode()); +    SDValue NewC = +        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1});      assert(NewC && "Constant folding failed");      return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));    } @@ -3244,6 +3340,12 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {      }    } +  // canonicalize (sub X, (vscale * C)) to (add X,  (vscale * -C)) +  if (N1.getOpcode() == ISD::VSCALE) { +    APInt IntVal = N1.getConstantOperandAPInt(0); +    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal)); +  } +    // Prefer an add for more folding potential and possibly better codegen:    // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)    if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) { @@ -3294,12 +3396,9 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) {    if (N0 == N1)      return DAG.getConstant(0, DL, VT); -  if 
(DAG.isConstantIntBuildVectorOrConstantInt(N0) && -      DAG.isConstantIntBuildVectorOrConstantInt(N1)) { -    // fold (sub_sat c1, c2) -> c3 -    return DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, N0.getNode(), -                                      N1.getNode()); -  } +  // fold (sub_sat c1, c2) -> c3 +  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1})) +    return C;    // fold (sub_sat x, 0) -> x    if (isNullConstant(N1)) @@ -3435,30 +3534,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {    if (N0.isUndef() || N1.isUndef())      return DAG.getConstant(0, SDLoc(N), VT); -  bool N0IsConst = false;    bool N1IsConst = false;    bool N1IsOpaqueConst = false; -  bool N0IsOpaqueConst = false; -  APInt ConstValue0, ConstValue1; +  APInt ConstValue1; +    // fold vector ops    if (VT.isVector()) {      if (SDValue FoldedVOp = SimplifyVBinOp(N))        return FoldedVOp; -    N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);      N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); -    assert((!N0IsConst || -            ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) && -           "Splat APInt should be element width");      assert((!N1IsConst ||              ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&             "Splat APInt should be element width");    } else { -    N0IsConst = isa<ConstantSDNode>(N0); -    if (N0IsConst) { -      ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue(); -      N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque(); -    }      N1IsConst = isa<ConstantSDNode>(N1);      if (N1IsConst) {        ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue(); @@ -3467,17 +3556,18 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {    }    // fold (mul c1, c2) -> c1*c2 -  if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst) -    return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, -                                      N0.getNode(), N1.getNode()); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) +    return C;    // canonicalize constant to RHS (vector doesn't have to splat)    if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&       !DAG.isConstantIntBuildVectorOrConstantInt(N1))      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); +    // fold (mul x, 0) -> 0    if (N1IsConst && ConstValue1.isNullValue())      return N1; +    // fold (mul x, 1) -> x    if (N1IsConst && ConstValue1.isOneValue())      return N0; @@ -3491,6 +3581,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {      return DAG.getNode(ISD::SUB, DL, VT,                         DAG.getConstant(0, DL, VT), N0);    } +    // fold (mul x, (1 << c)) -> x << c    if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&        DAG.isKnownToBeAPowerOfTwo(N1) && @@ -3501,6 +3592,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);      return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);    } +    // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c    if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {      unsigned Log2Val = (-ConstValue1).logBase2(); @@ -3589,6 +3681,14 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {                           DAG.getNode(ISD::MUL, SDLoc(N1), VT,                                       N0.getOperand(1), N1)); +  // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)). 
+  if (N0.getOpcode() == ISD::VSCALE) +    if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) { +      APInt C0 = N0.getConstantOperandAPInt(0); +      APInt C1 = NC1->getAPIntValue(); +      return DAG.getVScale(SDLoc(N), VT, C0 * C1); +    } +    // reassociate mul    if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))      return RMUL; @@ -3746,13 +3846,14 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {    SDLoc DL(N);    // fold (sdiv c1, c2) -> c1/c2 -  ConstantSDNode *N0C = isConstOrConstSplat(N0);    ConstantSDNode *N1C = isConstOrConstSplat(N1); -  if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) +    return C; +    // fold (sdiv X, -1) -> 0-X    if (N1C && N1C->isAllOnesValue())      return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); +    // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)    if (N1C && N1C->getAPIntValue().isMinSignedValue())      return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), @@ -3890,12 +3991,10 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {    SDLoc DL(N);    // fold (udiv c1, c2) -> c1/c2 -  ConstantSDNode *N0C = isConstOrConstSplat(N0);    ConstantSDNode *N1C = isConstOrConstSplat(N1); -  if (N0C && N1C) -    if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, -                                                    N0C, N1C)) -      return Folded; +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) +    return C; +    // fold (udiv X, -1) -> select(X == -1, 1, 0)    if (N1C && N1C->getAPIntValue().isAllOnesValue())      return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), @@ -3988,11 +4087,10 @@ SDValue DAGCombiner::visitREM(SDNode *N) {    SDLoc DL(N);    // fold (rem c1, c2) -> c1%c2 -  ConstantSDNode *N0C = isConstOrConstSplat(N0);    ConstantSDNode *N1C = isConstOrConstSplat(N1); -  if (N0C && N1C) -    if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) -      return Folded; +  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) +    return C; +    // fold (urem X, -1) -> select(X == -1, 0, x)    if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())      return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), @@ -4088,7 +4186,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {    // If the type twice as wide is legal, transform the mulhs to a wider multiply    // plus a shift. -  if (VT.isSimple() && !VT.isVector()) { +  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {      MVT Simple = VT.getSimpleVT();      unsigned SimpleSize = Simple.getSizeInBits();      EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); @@ -4144,7 +4242,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {    // If the type twice as wide is legal, transform the mulhu to a wider multiply    // plus a shift. 
-  if (VT.isSimple() && !VT.isVector()) { +  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {      MVT Simple = VT.getSimpleVT();      unsigned SimpleSize = Simple.getSizeInBits();      EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); @@ -4317,6 +4415,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1);    EVT VT = N0.getValueType(); +  unsigned Opcode = N->getOpcode();    // fold vector ops    if (VT.isVector()) @@ -4324,19 +4423,16 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {        return FoldedVOp;    // fold operation with constant operands. -  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); -  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); -  if (N0C && N1C) -    return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1})) +    return C;    // canonicalize constant to RHS    if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && -     !DAG.isConstantIntBuildVectorOrConstantInt(N1)) +      !DAG.isConstantIntBuildVectorOrConstantInt(N1))      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);    // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.    // Only do this if the current op isn't legal and the flipped is. -  unsigned Opcode = N->getOpcode();    if (!TLI.isOperationLegal(Opcode, VT) &&        (N0.isUndef() || DAG.SignBitIsZero(N0)) &&        (N1.isUndef() || DAG.SignBitIsZero(N1))) { @@ -4825,11 +4921,16 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,      return false;    // Ensure that this isn't going to produce an unsupported memory access. -  if (ShAmt && -      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, -                              LDST->getAddressSpace(), ShAmt / 8, -                              LDST->getMemOperand()->getFlags())) -    return false; +  if (ShAmt) { +    assert(ShAmt % 8 == 0 && "ShAmt is byte offset"); +    const unsigned ByteShAmt = ShAmt / 8; +    const Align LDSTAlign = LDST->getAlign(); +    const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt); +    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, +                                LDST->getAddressSpace(), NarrowAlign, +                                LDST->getMemOperand()->getFlags())) +      return false; +  }    // It's not possible to generate a constant of extended or untyped type.    
EVT PtrType = LDST->getBasePtr().getValueType(); @@ -5174,17 +5275,19 @@ SDValue DAGCombiner::visitAND(SDNode *N) {    }    // fold (and c1, c2) -> c1&c2 -  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);    ConstantSDNode *N1C = isConstOrConstSplat(N1); -  if (N0C && N1C && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) +    return C; +    // canonicalize constant to RHS    if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&        !DAG.isConstantIntBuildVectorOrConstantInt(N1))      return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); +    // fold (and x, -1) -> x    if (isAllOnesConstant(N1))      return N0; +    // if (and x, c) is known to be zero, return 0    unsigned BitWidth = VT.getScalarSizeInBits();    if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), @@ -5654,6 +5757,48 @@ static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {    return false;  } +// Match this pattern: +//   (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff)) +// And rewrite this to: +//   (rotr (bswap A), 16) +static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI, +                                       SelectionDAG &DAG, SDNode *N, SDValue N0, +                                       SDValue N1, EVT VT, EVT ShiftAmountTy) { +  assert(N->getOpcode() == ISD::OR && VT == MVT::i32 && +         "MatchBSwapHWordOrAndAnd: expecting i32"); +  if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) +    return SDValue(); +  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) +    return SDValue(); +  // TODO: this is too restrictive; lifting this restriction requires more tests +  if (!N0->hasOneUse() || !N1->hasOneUse()) +    return SDValue(); +  ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1)); +  ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1)); +  if (!Mask0 || !Mask1) +    return SDValue(); +  if (Mask0->getAPIntValue() != 0xff00ff00 || +      Mask1->getAPIntValue() != 0x00ff00ff) +    return SDValue(); +  SDValue Shift0 = N0.getOperand(0); +  SDValue Shift1 = N1.getOperand(0); +  if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL) +    return SDValue(); +  ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1)); +  ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1)); +  if (!ShiftAmt0 || !ShiftAmt1) +    return SDValue(); +  if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8) +    return SDValue(); +  if (Shift0.getOperand(0) != Shift1.getOperand(0)) +    return SDValue(); + +  SDLoc DL(N); +  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0)); +  SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy); +  return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt); +} +  /// Match a 32-bit packed halfword bswap. That is  /// ((x & 0x000000ff) << 8) |  /// ((x & 0x0000ff00) >> 8) | @@ -5670,6 +5815,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {    if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))      return SDValue(); +  if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT, +                                              getShiftAmountTy(VT))) +  return BSwap; + +  // Try again with commuted operands. 
+  if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT, +                                              getShiftAmountTy(VT))) +  return BSwap; + +    // Look for either    // (or (bswaphpair), (bswaphpair))    // (or (or (bswaphpair), (and)), (and)) @@ -5875,17 +6030,19 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    }    // fold (or c1, c2) -> c1|c2 -  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); -  if (N0C && N1C && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) +    return C; +    // canonicalize constant to RHS    if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&       !DAG.isConstantIntBuildVectorOrConstantInt(N1))      return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); +    // fold (or x, 0) -> x    if (isNullConstant(N1))      return N0; +    // fold (or x, -1) -> -1    if (isAllOnesConstant(N1))      return N1; @@ -5920,8 +6077,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {    };    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&        ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) { -    if (SDValue COR = DAG.FoldConstantArithmetic( -            ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) { +    if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, +                                                 {N1, N0.getOperand(1)})) {        SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);        AddToWorklist(IOR.getNode());        return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR); @@ -6020,6 +6177,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,    ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));    // (add v v) -> (shl v 1) +  // TODO: Should this be a general DAG canonicalization?    if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&        ExtractFrom.getOpcode() == ISD::ADD &&        ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) && @@ -6192,8 +6350,12 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,    //              EltSize & Mask == NegC & Mask    //    // (because "x & Mask" is a truncation and distributes through subtraction). +  // +  // We also need to account for a potential truncation of NegOp1 if the amount +  // has already been legalized to a shift amount type.    APInt Width; -  if (Pos == NegOp1) +  if ((Pos == NegOp1) || +      (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))      Width = NegC->getAPIntValue();    // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. @@ -6246,19 +6408,91 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,    return SDValue();  } +// A subroutine of MatchRotate used once we have found an OR of two opposite +// shifts of N0 + N1.  If Neg == <operand size> - Pos then the OR reduces +// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the +// former being preferred if supported.  InnerPos and InnerNeg are Pos and +// Neg with outer conversions stripped away. +// TODO: Merge with MatchRotatePosNeg. 
+SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, +                                       SDValue Neg, SDValue InnerPos, +                                       SDValue InnerNeg, unsigned PosOpcode, +                                       unsigned NegOpcode, const SDLoc &DL) { +  EVT VT = N0.getValueType(); +  unsigned EltBits = VT.getScalarSizeInBits(); + +  // fold (or (shl x0, (*ext y)), +  //          (srl x1, (*ext (sub 32, y)))) -> +  //   (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y)) +  // +  // fold (or (shl x0, (*ext (sub 32, y))), +  //          (srl x1, (*ext y))) -> +  //   (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y)) +  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) { +    bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); +    return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1, +                       HasPos ? Pos : Neg); +  } + +  // Matching the shift+xor cases, we can't easily use the xor'd shift amount +  // so for now just use the PosOpcode case if its legal. +  // TODO: When can we use the NegOpcode case? +  if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) { +    auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) { +      if (Op.getOpcode() != BinOpc) +        return false; +      ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1)); +      return Cst && (Cst->getAPIntValue() == Imm); +    }; + +    // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31))) +    //   -> (fshl x0, x1, y) +    if (IsBinOpImm(N1, ISD::SRL, 1) && +        IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) && +        InnerPos == InnerNeg.getOperand(0) && +        TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) { +      return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos); +    } + +    // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) +    //   -> (fshr x0, x1, y) +    if (IsBinOpImm(N0, ISD::SHL, 1) && +        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && +        InnerNeg == InnerPos.getOperand(0) && +        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { +      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg); +    } + +    // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) +    //   -> (fshr x0, x1, y) +    // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization? +    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) && +        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) && +        InnerNeg == InnerPos.getOperand(0) && +        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) { +      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg); +    } +  } + +  return SDValue(); +} +  // MatchRotate - Handle an 'or' of two operands.  If this is one of the many  // idioms for rotate, and if the target supports rotation instructions, generate -// a rot[lr]. +// a rot[lr]. This also matches funnel shift patterns, similar to rotation but +// with different shifted sources.  SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {    // Must be a legal type.  Expanded 'n promoted things won't work with rotates.    EVT VT = LHS.getValueType();    if (!TLI.isTypeLegal(VT))      return SDValue(); -  // The target must have at least one rotate flavor. +  // The target must have at least one rotate/funnel flavor.    
bool HasROTL = hasOperation(ISD::ROTL, VT);    bool HasROTR = hasOperation(ISD::ROTR, VT); -  if (!HasROTL && !HasROTR) +  bool HasFSHL = hasOperation(ISD::FSHL, VT); +  bool HasFSHR = hasOperation(ISD::FSHR, VT); +  if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)      return SDValue();    // Check for truncated rotate. @@ -6308,12 +6542,13 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {    // At this point we've matched or extracted a shift op on each side. -  if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) -    return SDValue(); // Not shifting the same value. -    if (LHSShift.getOpcode() == RHSShift.getOpcode())      return SDValue(); // Shifts must disagree. +  bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); +  if (!IsRotate && !(HasFSHL || HasFSHR)) +    return SDValue(); // Requires funnel shift support. +    // Canonicalize shl to left side in a shl/srl pair.    if (RHSShift.getOpcode() == ISD::SHL) {      std::swap(LHS, RHS); @@ -6329,13 +6564,21 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {    // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)    // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) +  // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) +  // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) +  // iff C1+C2 == EltSizeInBits    auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,                                          ConstantSDNode *RHS) {      return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;    };    if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { -    SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, -                              LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); +    SDValue Res; +    if (IsRotate && (HasROTL || HasROTR)) +      Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, +                        HasROTL ? LHSShiftAmt : RHSShiftAmt); +    else +      Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg, +                        RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt);      // If there is an AND of either shifted operand, apply it to the result.      
if (LHSMask.getNode() || RHSMask.getNode()) { @@ -6353,10 +6596,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {                             DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));        } -      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); +      Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);      } -    return Rot; +    return Res;    }    // If there is a mask here, and we have a variable shift, we can't be sure @@ -6379,13 +6622,29 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {      RExtOp0 = RHSShiftAmt.getOperand(0);    } -  SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, -                                   LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL); +  if (IsRotate && (HasROTL || HasROTR)) { +    SDValue TryL = +        MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, +                          RExtOp0, ISD::ROTL, ISD::ROTR, DL); +    if (TryL) +      return TryL; + +    SDValue TryR = +        MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, +                          LExtOp0, ISD::ROTR, ISD::ROTL, DL); +    if (TryR) +      return TryR; +  } + +  SDValue TryL = +      MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt, +                        LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);    if (TryL)      return TryL; -  SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, -                                   RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL); +  SDValue TryR = +      MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt, +                        RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);    if (TryR)      return TryR; @@ -6610,9 +6869,9 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {    if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))      return SDValue(); -  // Check if all the bytes of the combined value we are looking at are stored  -  // to the same base address. Collect bytes offsets from Base address into  -  // ByteOffsets.  +  // Check if all the bytes of the combined value we are looking at are stored +  // to the same base address. Collect bytes offsets from Base address into +  // ByteOffsets.    SDValue CombinedValue;    SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);    int64_t FirstOffset = INT64_MAX; @@ -6630,17 +6889,16 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {      SDValue Value = Trunc.getOperand(0);      if (Value.getOpcode() == ISD::SRL ||          Value.getOpcode() == ISD::SRA) { -      ConstantSDNode *ShiftOffset = -        dyn_cast<ConstantSDNode>(Value.getOperand(1)); -      // Trying to match the following pattern. The shift offset must be  +      auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1)); +      // Trying to match the following pattern. The shift offset must be        // a constant and a multiple of 8. It is the byte offset in "y". -      //  +      //        // x = srl y, offset -      // i8 z = trunc x  +      // i8 z = trunc x        // store z, ...        
if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))          return SDValue(); -   +       Offset = ShiftOffset->getSExtValue()/8;       Value = Value.getOperand(0);      } @@ -6685,7 +6943,7 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {    assert(FirstOffset != INT64_MAX && "First byte offset must be set");    assert(FirstStore && "First store must be set"); -  // Check if the bytes of the combined value we are looking at match with  +  // Check if the bytes of the combined value we are looking at match with    // either big or little endian value store.    Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);    if (!IsBigEndian.hasValue()) @@ -7030,20 +7288,22 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {    SDLoc DL(N);    if (N0.isUndef() && N1.isUndef())      return DAG.getConstant(0, DL, VT); +    // fold (xor x, undef) -> undef    if (N0.isUndef())      return N0;    if (N1.isUndef())      return N1; +    // fold (xor c1, c2) -> c1^c2 -  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); -  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); -  if (N0C && N1C) -    return DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1})) +    return C; +    // canonicalize constant to RHS    if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&       !DAG.isConstantIntBuildVectorOrConstantInt(N1))      return DAG.getNode(ISD::XOR, DL, VT, N1, N0); +    // fold (xor x, 0) -> x    if (isNullConstant(N1))      return N0; @@ -7058,7 +7318,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {    // fold !(x cc y) -> (x !cc y)    unsigned N0Opcode = N0.getOpcode();    SDValue LHS, RHS, CC; -  if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { +  if (TLI.isConstTrueVal(N1.getNode()) && +      isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {      ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),                                                 LHS.getValueType());      if (!LegalOperations || @@ -7071,6 +7332,21 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {        case ISD::SELECT_CC:          return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),                                 N0.getOperand(3), NotCC); +      case ISD::STRICT_FSETCC: +      case ISD::STRICT_FSETCCS: { +        if (N0.hasOneUse()) { +          // FIXME Can we handle multiple uses? Could we token factor the chain +          // results from the new/old setcc? +          SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, +                                       N0.getOperand(0), +                                       N0Opcode == ISD::STRICT_FSETCCS); +          CombineTo(N, SetCC); +          DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1)); +          recursivelyDeleteUnusedNodes(N0.getNode()); +          return SDValue(N, 0); // Return N so it doesn't get rechecked! +        } +        break; +      }        }      }    } @@ -7405,15 +7681,29 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {    }    // fold (rot x, c) -> (rot x, c % BitSize) -  // TODO - support non-uniform vector amounts. 
-  if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { -    if (Cst->getAPIntValue().uge(Bitsize)) { -      uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); -      return DAG.getNode(N->getOpcode(), dl, VT, N0, -                         DAG.getConstant(RotAmt, dl, N1.getValueType())); -    } +  bool OutOfRange = false; +  auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) { +    OutOfRange |= C->getAPIntValue().uge(Bitsize); +    return true; +  }; +  if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) { +    EVT AmtVT = N1.getValueType(); +    SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT); +    if (SDValue Amt = +            DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits})) +      return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);    } +  // rot i16 X, 8 --> bswap X +  auto *RotAmtC = isConstOrConstSplat(N1); +  if (RotAmtC && RotAmtC->getAPIntValue() == 8 && +      VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT)) +    return DAG.getNode(ISD::BSWAP, dl, VT, N0); + +  // Simplify the operands using demanded-bits information. +  if (SimplifyDemandedBits(SDValue(N, 0))) +    return SDValue(N, 0); +    // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).    if (N1.getOpcode() == ISD::TRUNCATE &&        N1.getOperand(0).getOpcode() == ISD::AND) { @@ -7430,12 +7720,11 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {        EVT ShiftVT = C1->getValueType(0);        bool SameSide = (N->getOpcode() == NextOp);        unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; -      if (SDValue CombinedShift = -              DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { +      if (SDValue CombinedShift = DAG.FoldConstantArithmetic( +              CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {          SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);          SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( -            ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), -            BitsizeC.getNode()); +            ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});          return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),                             CombinedShiftNorm);        } @@ -7471,8 +7760,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {          if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&              TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==                  TargetLowering::ZeroOrNegativeOneBooleanContent) { -          if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, -                                                     N01CV, N1CV)) +          if (SDValue C = +                  DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))              return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);          }        } @@ -7482,10 +7771,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {    ConstantSDNode *N1C = isConstOrConstSplat(N1);    // fold (shl c1, c2) -> c1<<c2 -  // TODO - support non-uniform vector shift amounts. 
-  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); -  if (N0C && N1C && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) +    return C;    if (SDValue NewSel = foldBinOpIntoSelect(N))      return NewSel; @@ -7502,8 +7789,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {        return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);    } -  // TODO - support non-uniform vector shift amounts. -  if (N1C && SimplifyDemandedBits(SDValue(N, 0))) +  if (SimplifyDemandedBits(SDValue(N, 0)))      return SDValue(N, 0);    // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) @@ -7691,9 +7977,90 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {      if (SDValue NewSHL = visitShiftByConstant(N))        return NewSHL; +  // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)). +  if (N0.getOpcode() == ISD::VSCALE) +    if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) { +      auto DL = SDLoc(N); +      APInt C0 = N0.getConstantOperandAPInt(0); +      APInt C1 = NC1->getAPIntValue(); +      return DAG.getVScale(DL, VT, C0 << C1); +    } +    return SDValue();  } +// Transform a right shift of a multiply into a multiply-high. +// Examples: +// (srl (mul (zext i32:$a to i64), (zext i32:$a to i64)), 32) -> (mulhu $a, $b) +// (sra (mul (sext i32:$a to i64), (sext i32:$a to i64)), 32) -> (mulhs $a, $b) +static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, +                                  const TargetLowering &TLI) { +  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && +         "SRL or SRA node is required here!"); + +  // Check the shift amount. Proceed with the transformation if the shift +  // amount is constant. +  ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1)); +  if (!ShiftAmtSrc) +    return SDValue(); + +  SDLoc DL(N); + +  // The operation feeding into the shift must be a multiply. +  SDValue ShiftOperand = N->getOperand(0); +  if (ShiftOperand.getOpcode() != ISD::MUL) +    return SDValue(); + +  // Both operands must be equivalent extend nodes. +  SDValue LeftOp = ShiftOperand.getOperand(0); +  SDValue RightOp = ShiftOperand.getOperand(1); +  bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND; +  bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND; + +  if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode()) +    return SDValue(); + +  EVT WideVT1 = LeftOp.getValueType(); +  EVT WideVT2 = RightOp.getValueType(); +  (void)WideVT2; +  // Proceed with the transformation if the wide types match. +  assert((WideVT1 == WideVT2) && +         "Cannot have a multiply node with two different operand types."); + +  EVT NarrowVT = LeftOp.getOperand(0).getValueType(); +  // Check that the two extend nodes are the same type. +  if (NarrowVT !=  RightOp.getOperand(0).getValueType()) +    return SDValue(); + +  // Only transform into mulh if mulh for the narrow type is cheaper than +  // a multiply followed by a shift. This should also check if mulh is +  // legal for NarrowVT on the target. +  if (!TLI.isMulhCheaperThanMulShift(NarrowVT)) +      return SDValue(); + +  // Proceed with the transformation if the wide type is twice as large +  // as the narrow type. +  unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits(); +  if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize) +    return SDValue(); + +  // Check the shift amount with the narrow type size. 
+  // Proceed with the transformation if the shift amount is the width +  // of the narrow type. +  unsigned ShiftAmt = ShiftAmtSrc->getZExtValue(); +  if (ShiftAmt != NarrowVTSize) +    return SDValue(); + +  // If the operation feeding into the MUL is a sign extend (sext), +  // we use mulhs. Othewise, zero extends (zext) use mulhu. +  unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU; + +  SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), +                               RightOp.getOperand(0)); +  return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1) +                                     : DAG.getZExtOrTrunc(Result, DL, WideVT1)); +} +  SDValue DAGCombiner::visitSRA(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -7717,10 +8084,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {    ConstantSDNode *N1C = isConstOrConstSplat(N1);    // fold (sra c1, c2) -> (sra c1, c2) -  // TODO - support non-uniform vector shift amounts. -  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); -  if (N0C && N1C && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) +    return C;    if (SDValue NewSel = foldBinOpIntoSelect(N))      return NewSel; @@ -7811,7 +8176,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {    // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.    //   sra (add (shl X, N1C), AddC), N1C -->    //   sext (add (trunc X to (width - N1C)), AddC') -  if (!LegalTypes && N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C && +  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&        N0.getOperand(0).getOpcode() == ISD::SHL &&        N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {      if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) { @@ -7828,7 +8193,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {        //       implementation and/or target-specific overrides (because        //       non-simple types likely require masking when legalized), but that        //       restriction may conflict with other transforms. -      if (TruncVT.isSimple() && TLI.isTruncateFree(VT, TruncVT)) { +      if (TruncVT.isSimple() && isTypeLegal(TruncVT) && +          TLI.isTruncateFree(VT, TruncVT)) {          SDLoc DL(N);          SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);          SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt). @@ -7871,8 +8237,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {    }    // Simplify, based on bits shifted out of the LHS. -  // TODO - support non-uniform vector shift amounts. -  if (N1C && SimplifyDemandedBits(SDValue(N, 0))) +  if (SimplifyDemandedBits(SDValue(N, 0)))      return SDValue(N, 0);    // If the sign bit is known to be zero, switch this to a SRL. @@ -7883,6 +8248,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {      if (SDValue NewSRA = visitShiftByConstant(N))        return NewSRA; +  // Try to transform this shift into a multiply-high if +  // it matches the appropriate pattern detected in combineShiftToMULH. +  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI)) +    return MULH; +    return SDValue();  } @@ -7903,10 +8273,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {    ConstantSDNode *N1C = isConstOrConstSplat(N1);    // fold (srl c1, c2) -> c1 >>u c2 -  // TODO - support non-uniform vector shift amounts. 
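As an aside (not from the LLVM sources): combineShiftToMULH above is justified by the fact that the high half of a widening multiply equals the wide product shifted right by the narrow width. A standalone scalar model of the i32/i64 case; mulhs32 and mulhu32 are illustrative stand-ins for the MULHS/MULHU nodes:

    #include <cassert>
    #include <cstdint>

    // Models of MULHS/MULHU for 32-bit operands: the high 32 bits of the product.
    static int32_t mulhs32(int32_t A, int32_t B) {
      return (int32_t)(((int64_t)A * (int64_t)B) >> 32);
    }
    static uint32_t mulhu32(uint32_t A, uint32_t B) {
      return (uint32_t)(((uint64_t)A * (uint64_t)B) >> 32);
    }

    int main() {
      int32_t SA = -123456789, SB = 987654321;
      uint32_t UA = 0xDEADBEEFu, UB = 0x12345678u;

      // (sra (mul (sext a), (sext b)), 32) keeps only the high half -> mulhs a, b
      int64_t SWide = (int64_t)SA * (int64_t)SB;
      assert((int32_t)(SWide >> 32) == mulhs32(SA, SB));

      // (srl (mul (zext a), (zext b)), 32) keeps only the high half -> mulhu a, b
      uint64_t UWide = (uint64_t)UA * (uint64_t)UB;
      assert((uint32_t)(UWide >> 32) == mulhu32(UA, UB));
      return 0;
    }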
-  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); -  if (N0C && N1C && !N1C->isOpaque()) -    return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); +  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) +    return C;    if (SDValue NewSel = foldBinOpIntoSelect(N))      return NewSel; @@ -8070,8 +8438,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {    // fold operands of srl based on knowledge that the low bits are not    // demanded. -  // TODO - support non-uniform vector shift amounts. -  if (N1C && SimplifyDemandedBits(SDValue(N, 0))) +  if (SimplifyDemandedBits(SDValue(N, 0)))      return SDValue(N, 0);    if (N1C && !N1C->isOpaque()) @@ -8111,6 +8478,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {      }    } +  // Try to transform this shift into a multiply-high if +  // it matches the appropriate pattern detected in combineShiftToMULH. +  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI)) +    return MULH; +    return SDValue();  } @@ -8160,6 +8532,45 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {        return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,                           DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,                                           SDLoc(N), ShAmtTy)); + +    // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. +    // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. +    // TODO - bigendian support once we have test coverage. +    // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine? +    // TODO - permit LHS EXTLOAD if extensions are shifted out. +    if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && +        !DAG.getDataLayout().isBigEndian()) { +      auto *LHS = dyn_cast<LoadSDNode>(N0); +      auto *RHS = dyn_cast<LoadSDNode>(N1); +      if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && +          LHS->getAddressSpace() == RHS->getAddressSpace() && +          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && +          ISD::isNON_EXTLoad(LHS)) { +        if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { +          SDLoc DL(RHS); +          uint64_t PtrOff = +              IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8); +          Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff); +          bool Fast = false; +          if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, +                                     RHS->getAddressSpace(), NewAlign, +                                     RHS->getMemOperand()->getFlags(), &Fast) && +              Fast) { +            SDValue NewPtr = +                DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL); +            AddToWorklist(NewPtr.getNode()); +            SDValue Load = DAG.getLoad( +                VT, DL, RHS->getChain(), NewPtr, +                RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign, +                RHS->getMemOperand()->getFlags(), RHS->getAAInfo()); +            // Replace the old load's chain with the new load's chain. +            WorklistRemover DeadNodes(*this); +            DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1)); +            return Load; +          } +        } +      } +    }    }    // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) @@ -8609,7 +9020,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {          // Create the actual or node if we can generate good code for it.          
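As an aside (not from the LLVM sources): the new visitFunnelShift case above replaces a funnel shift of two adjacent loads with a single narrow load at byte offset ((BitWidth - ShAmt) % BitWidth) / 8 from the lower-addressed load. A host-side sketch of why that offset is the right one; it assumes a little-endian host, matching the fold's big-endian bail-out, and the buffer contents are arbitrary:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // fshl(Hi, Lo, C) for 32-bit operands: top 32 bits of (Hi:Lo) << C.
    static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned C) {
      C %= 32;
      return C == 0 ? Hi : (Hi << C) | (Lo >> (32 - C));
    }

    int main() {
      // Two adjacent little-endian i32 loads from one 8-byte buffer:
      // Lo from offset 0, Hi from offset 4 (Hi is "consecutive" after Lo).
      unsigned char Buf[8] = {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF};
      uint32_t Lo, Hi;
      std::memcpy(&Lo, Buf + 0, 4);
      std::memcpy(&Hi, Buf + 4, 4);

      for (unsigned ShAmt = 8; ShAmt < 32; ShAmt += 8) {
        unsigned PtrOff = ((32 - ShAmt) % 32) / 8; // offset the fold computes
        uint32_t Narrow;
        std::memcpy(&Narrow, Buf + PtrOff, 4);     // the single replacement load
        assert(fshl32(Hi, Lo, ShAmt) == Narrow);
      }
      return 0;
    }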
if (!normalizeToSequence) {            SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); -          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,  +          return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,                               N2_2, Flags);          }          // Otherwise see if we can optimize to a better pattern. @@ -8825,6 +9236,8 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {      SDValue N2Elt = N2.getOperand(i);      if (N1Elt.isUndef() || N2Elt.isUndef())        continue; +    if (N1Elt.getValueType() != N2Elt.getValueType()) +      continue;      const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();      const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue(); @@ -9395,8 +9808,7 @@ SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {    SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,                                N1.getOperand(1)); -  APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); -  Mask = Mask.zext(VT.getSizeInBits()); +  APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());    SDLoc DL0(N0);    SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,                              DAG.getConstant(Mask, DL0, VT)); @@ -9702,8 +10114,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {                                           LN00->getChain(), LN00->getBasePtr(),                                           LN00->getMemoryVT(),                                           LN00->getMemOperand()); -        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); -        Mask = Mask.sext(VT.getSizeInBits()); +        APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());          SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,                                    ExtLoad, DAG.getConstant(Mask, DL, VT));          ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND); @@ -9941,7 +10352,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {        if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&                                 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {          SDValue Op = N0.getOperand(0); -        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); +        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);          AddToWorklist(Op.getNode());          SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);          // Transfer the debug info; the new node is equivalent to N0. @@ -9953,7 +10364,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {      if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {        SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);        AddToWorklist(Op.getNode()); -      SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); +      SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);        // We may safely transfer the debug info describing the truncate node over        // to the equivalent and operation.        
DAG.transferDbgValues(N0, And); @@ -9971,8 +10382,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {         !TLI.isZExtFree(N0.getValueType(), VT))) {      SDValue X = N0.getOperand(0).getOperand(0);      X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); -    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); -    Mask = Mask.zext(VT.getSizeInBits()); +    APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());      SDLoc DL(N);      return DAG.getNode(ISD::AND, DL, VT,                         X, DAG.getConstant(Mask, DL, VT)); @@ -10026,8 +10436,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {                                           LN00->getChain(), LN00->getBasePtr(),                                           LN00->getMemoryVT(),                                           LN00->getMemOperand()); -        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); -        Mask = Mask.zext(VT.getSizeInBits()); +        APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());          SDLoc DL(N);          SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,                                    ExtLoad, DAG.getConstant(Mask, DL, VT)); @@ -10080,23 +10489,22 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {        // that the element size of the sext'd result matches the element size of        // the compare operands.        SDLoc DL(N); -      SDValue VecOnes = DAG.getConstant(1, DL, VT);        if (VT.getSizeInBits() == N00VT.getSizeInBits()) { -        // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors. +        // zext(setcc) -> zext_in_reg(vsetcc) for vectors.          SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),                                       N0.getOperand(1), N0.getOperand(2)); -        return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes); +        return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());        }        // If the desired elements are smaller or larger than the source        // elements we can use a matching integer vector type and then -      // truncate/sign extend. +      // truncate/any extend followed by zext_in_reg.        EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();        SDValue VsetCC =            DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),                        N0.getOperand(1), N0.getOperand(2)); -      return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT), -                         VecOnes); +      return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL, +                                    N0.getValueType());      }      // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc @@ -10127,7 +10535,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {      SDLoc DL(N);      // Ensure that the shift amount is wide enough for the shifted value. 
-    if (VT.getSizeInBits() >= 256) +    if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())        ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);      return DAG.getNode(N0.getOpcode(), DL, VT, @@ -10187,8 +10595,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {      SDLoc DL(N);      SDValue X = N0.getOperand(0).getOperand(0);      X = DAG.getAnyExtOrTrunc(X, DL, VT); -    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); -    Mask = Mask.zext(VT.getSizeInBits()); +    APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());      return DAG.getNode(ISD::AND, DL, VT,                         X, DAG.getConstant(Mask, DL, VT));    } @@ -10348,6 +10755,45 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {    return SDValue();  } +SDValue DAGCombiner::visitAssertAlign(SDNode *N) { +  SDLoc DL(N); + +  Align AL = cast<AssertAlignSDNode>(N)->getAlign(); +  SDValue N0 = N->getOperand(0); + +  // Fold (assertalign (assertalign x, AL0), AL1) -> +  // (assertalign x, max(AL0, AL1)) +  if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0)) +    return DAG.getAssertAlign(DL, N0.getOperand(0), +                              std::max(AL, AAN->getAlign())); + +  // In rare cases, there are trivial arithmetic ops in source operands. Sink +  // this assert down to source operands so that those arithmetic ops could be +  // exposed to the DAG combining. +  switch (N0.getOpcode()) { +  default: +    break; +  case ISD::ADD: +  case ISD::SUB: { +    unsigned AlignShift = Log2(AL); +    SDValue LHS = N0.getOperand(0); +    SDValue RHS = N0.getOperand(1); +    unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros(); +    unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros(); +    if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) { +      if (LHSAlignShift < AlignShift) +        LHS = DAG.getAssertAlign(DL, LHS, AL); +      if (RHSAlignShift < AlignShift) +        RHS = DAG.getAssertAlign(DL, RHS, AL); +      return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS); +    } +    break; +  } +  } + +  return SDValue(); +} +  /// If the result of a wider load is shifted to right of N  bits and then  /// truncated to a narrower type and where N is a multiple of number of bits of  /// the narrower type, transform it to a narrower load from address + N / num of @@ -10428,9 +10874,8 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {        }        // At this point, we must have a load or else we can't do the transform. 
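As an aside (not from the LLVM sources): the add/sub case in visitAssertAlign above sinks the alignment assertion onto the operand that is not already known aligned, which is plain modular arithmetic: if the sum is 0 mod A and one addend is 0 mod A, the other addend is too. A small check with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t A = 16; // the asserted alignment
      // Suppose the DAG knows the sum (Base + Off) is A-aligned (the AssertAlign)
      // and that Base alone has at least log2(A) trailing zero bits.
      for (uint64_t Base = 0; Base < 1024; Base += A)
        for (uint64_t Sum = 0; Sum < 4096; Sum += A) {
          uint64_t Off = Sum - Base; // the other add operand
          // Then the same alignment assertion is valid on Off as well.
          assert(Off % A == 0);
        }
      return 0;
    }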
-      if (!isa<LoadSDNode>(N0)) return SDValue(); - -      auto *LN0 = cast<LoadSDNode>(N0); +      auto *LN0 = dyn_cast<LoadSDNode>(N0); +      if (!LN0) return SDValue();        // Because a SRL must be assumed to *need* to zero-extend the high bits        // (as opposed to anyext the high bits), we can't combine the zextload @@ -10449,8 +10894,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {        SDNode *Mask = *(SRL->use_begin());        if (Mask->getOpcode() == ISD::AND &&            isa<ConstantSDNode>(Mask->getOperand(1))) { -        const APInt &ShiftMask = -          cast<ConstantSDNode>(Mask->getOperand(1))->getAPIntValue(); +        const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);          if (ShiftMask.isMask()) {            EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),                                             ShiftMask.countTrailingOnes()); @@ -10480,7 +10924,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {    LoadSDNode *LN0 = cast<LoadSDNode>(N0);    // Reducing the width of a volatile load is illegal.  For atomics, we may be -  // able to reduce the width provided we never widen again. (see D66309)   +  // able to reduce the width provided we never widen again. (see D66309)    if (!LN0->isSimple() ||        !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))      return SDValue(); @@ -10561,26 +11005,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1);    EVT VT = N->getValueType(0); -  EVT EVT = cast<VTSDNode>(N1)->getVT(); +  EVT ExtVT = cast<VTSDNode>(N1)->getVT();    unsigned VTBits = VT.getScalarSizeInBits(); -  unsigned EVTBits = EVT.getScalarSizeInBits(); +  unsigned ExtVTBits = ExtVT.getScalarSizeInBits(); +  // sext_vector_inreg(undef) = 0 because the top bit will all be the same.    if (N0.isUndef()) -    return DAG.getUNDEF(VT); +    return DAG.getConstant(0, SDLoc(N), VT);    // fold (sext_in_reg c1) -> c1    if (DAG.isConstantIntBuildVectorOrConstantInt(N0))      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);    // If the input is already sign extended, just drop the extension. 
-  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1) +  if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))      return N0;    // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2    if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && -      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) -    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, -                       N0.getOperand(0), N1); +      ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) +    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), +                       N1);    // fold (sext_in_reg (sext x)) -> (sext x)    // fold (sext_in_reg (aext x)) -> (sext x) @@ -10589,8 +11034,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {      SDValue N00 = N0.getOperand(0);      unsigned N00Bits = N00.getScalarValueSizeInBits(); -    if ((N00Bits <= EVTBits || -         (N00Bits - DAG.ComputeNumSignBits(N00)) < EVTBits) && +    if ((N00Bits <= ExtVTBits || +         (N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) &&          (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);    } @@ -10599,7 +11044,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||         N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||         N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && -      N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { +      N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {      if (!LegalOperations ||          TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))        return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, @@ -10610,14 +11055,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    // iff we are extending the source sign bit.    if (N0.getOpcode() == ISD::ZERO_EXTEND) {      SDValue N00 = N0.getOperand(0); -    if (N00.getScalarValueSizeInBits() == EVTBits && +    if (N00.getScalarValueSizeInBits() == ExtVTBits &&          (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);    }    // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. -  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) -    return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); +  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1))) +    return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);    // fold operands of sext_in_reg based on knowledge that the top bits are not    // demanded. @@ -10634,11 +11079,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.    if (N0.getOpcode() == ISD::SRL) {      if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1))) -      if (ShAmt->getAPIntValue().ule(VTBits - EVTBits)) { +      if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {          // We can turn this into an SRA iff the input to the SRL is already sign          // extended enough.          
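As an aside (not from the LLVM sources): sign_extend_inreg(x, i8) behaves like truncating to i8 and sign-extending back, and the hunk above keeps the fold that demotes it to a zero_extend_inreg once the i8 sign bit is known zero. A scalar illustration; the helper names are made up:

    #include <cassert>
    #include <cstdint>

    // Model of (sign_extend_inreg x, i8) on an i32 value, written portably.
    static int32_t sextInReg8(int32_t X) { return ((X & 0xFF) ^ 0x80) - 0x80; }
    // Model of (zero_extend_inreg x, i8) on an i32 value.
    static int32_t zextInReg8(int32_t X) { return X & 0xFF; }

    int main() {
      // The upper 24 bits become copies of bit 7.
      assert(sextInReg8(0x000000F0) == -16);
      assert(sextInReg8(0x12345678) == 0x78);

      // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero:
      // whenever bit 7 is clear the two operations agree.
      for (int32_t X = 0; X < 0x10000; ++X)
        if ((X & 0x80) == 0)
          assert(sextInReg8(X) == zextInReg8(X));
      return 0;
    }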
unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); -        if (((VTBits - EVTBits) - ShAmt->getZExtValue()) < InSignBits) +        if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)            return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),                               N0.getOperand(1));        } @@ -10650,14 +11095,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    // extends that the target does support.    if (ISD::isEXTLoad(N0.getNode()) &&        ISD::isUNINDEXEDLoad(N0.getNode()) && -      EVT == cast<LoadSDNode>(N0)->getMemoryVT() && +      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&        ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&          N0.hasOneUse()) || -       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { +       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {      LoadSDNode *LN0 = cast<LoadSDNode>(N0);      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,                                       LN0->getChain(), -                                     LN0->getBasePtr(), EVT, +                                     LN0->getBasePtr(), ExtVT,                                       LN0->getMemOperand());      CombineTo(N, ExtLoad);      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); @@ -10667,13 +11112,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use    if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&        N0.hasOneUse() && -      EVT == cast<LoadSDNode>(N0)->getMemoryVT() && +      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&        ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) && -       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { +       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {      LoadSDNode *LN0 = cast<LoadSDNode>(N0);      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,                                       LN0->getChain(), -                                     LN0->getBasePtr(), EVT, +                                     LN0->getBasePtr(), ExtVT,                                       LN0->getMemOperand());      CombineTo(N, ExtLoad);      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); @@ -10681,11 +11126,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {    }    // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) -  if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) { +  if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {      if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),                                             N0.getOperand(1), false)) -      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, -                         BSwap, N1); +      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);    }    return SDValue(); @@ -10695,8 +11139,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {    SDValue N0 = N->getOperand(0);    EVT VT = N->getValueType(0); +  // sext_vector_inreg(undef) = 0 because the top bit will all be the same.    if (N0.isUndef()) -    return DAG.getUNDEF(VT); +    return DAG.getConstant(0, SDLoc(N), VT);    if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))      return Res; @@ -10711,8 +11156,9 @@ SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {    SDValue N0 = N->getOperand(0);    EVT VT = N->getValueType(0); +  // zext_vector_inreg(undef) = 0 because the top bits will be zero.  
  if (N0.isUndef()) -    return DAG.getUNDEF(VT); +    return DAG.getConstant(0, SDLoc(N), VT);    if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))      return Res; @@ -10788,13 +11234,12 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {      SDValue EltNo = N0->getOperand(1);      if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {        int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); -      EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());        int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));        SDLoc DL(N);        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,                           DAG.getBitcast(NVT, N0.getOperand(0)), -                         DAG.getConstant(Index, DL, IndexTy)); +                         DAG.getVectorIdxConstant(Index, DL));      }    } @@ -10832,7 +11277,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {    // Attempt to pre-truncate BUILD_VECTOR sources.    if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations && -      TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType())) { +      TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) && +      // Avoid creating illegal types if running after type legalizer. +      (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {      SDLoc DL(N);      EVT SVT = VT.getScalarType();      SmallVector<SDValue, 8> TruncOps; @@ -10961,10 +11408,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {           TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {        SDLoc SL(N); -      EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());        unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc, -                         DAG.getConstant(Idx, SL, IdxVT)); +                         DAG.getVectorIdxConstant(Idx, SL));      }    } @@ -11064,14 +11510,14 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {    unsigned LD1Bytes = LD1VT.getStoreSize();    if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&        DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) { -    unsigned Align = LD1->getAlignment(); -    unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( +    Align Alignment = LD1->getAlign(); +    Align NewAlign = DAG.getDataLayout().getABITypeAlign(          VT.getTypeForEVT(*DAG.getContext())); -    if (NewAlign <= Align && +    if (NewAlign <= Alignment &&          (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))        return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(), -                         LD1->getPointerInfo(), Align); +                         LD1->getPointerInfo(), Alignment);    }    return SDValue(); @@ -11389,6 +11835,20 @@ SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {    return CombineConsecutiveLoads(N, VT);  } +SDValue DAGCombiner::visitFREEZE(SDNode *N) { +  SDValue N0 = N->getOperand(0); + +  // (freeze (freeze x)) -> (freeze x) +  if (N0.getOpcode() == ISD::FREEZE) +    return N0; + +  // If the input is a constant, return it. +  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) +    return N0; + +  return SDValue(); +} +  /// We know that BV is a build_vector node with Constant, ConstantFP or Undef  /// operands. DstEltVT indicates the destination element value type.  
SDValue DAGCombiner:: @@ -11519,7 +11979,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    // Floating-point multiply-add with intermediate rounding. -  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N)); +  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));    // Floating-point multiply-add without intermediate rounding.    bool HasFMA = @@ -11532,13 +11992,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    SDNodeFlags Flags = N->getFlags();    bool CanFuse = Options.UnsafeFPMath || isContractable(N); +  bool CanReassociate = +      Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();    bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||                                CanFuse || HasFMAD);    // If the addition is not contractable, do not combine.    if (!AllowFusionGlobally && !isContractable(N))      return SDValue(); -  const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();    if (STI && STI->generateFMAsInMachineCombiner(OptLevel))      return SDValue(); @@ -11573,6 +12034,30 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {                         N1.getOperand(0), N1.getOperand(1), N0, Flags);    } +  // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) +  // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E) +  // This requires reassociation because it changes the order of operations. +  SDValue FMA, E; +  if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode && +      N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() && +      N0.getOperand(2).hasOneUse()) { +    FMA = N0; +    E = N1; +  } else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode && +             N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() && +             N1.getOperand(2).hasOneUse()) { +    FMA = N1; +    E = N0; +  } +  if (FMA && E) { +    SDValue A = FMA.getOperand(0); +    SDValue B = FMA.getOperand(1); +    SDValue C = FMA.getOperand(2).getOperand(0); +    SDValue D = FMA.getOperand(2).getOperand(1); +    SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags); +    return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags); +  } +    // Look through FP_EXTEND nodes to do more combining.    // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) @@ -11606,33 +12091,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {    // More folding opportunities when target permits.    
if (Aggressive) { -    // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) -    if (CanFuse && -        N0.getOpcode() == PreferredFusedOpcode && -        N0.getOperand(2).getOpcode() == ISD::FMUL && -        N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { -      return DAG.getNode(PreferredFusedOpcode, SL, VT, -                         N0.getOperand(0), N0.getOperand(1), -                         DAG.getNode(PreferredFusedOpcode, SL, VT, -                                     N0.getOperand(2).getOperand(0), -                                     N0.getOperand(2).getOperand(1), -                                     N1, Flags), Flags); -    } - -    // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) -    if (CanFuse && -        N1->getOpcode() == PreferredFusedOpcode && -        N1.getOperand(2).getOpcode() == ISD::FMUL && -        N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { -      return DAG.getNode(PreferredFusedOpcode, SL, VT, -                         N1.getOperand(0), N1.getOperand(1), -                         DAG.getNode(PreferredFusedOpcode, SL, VT, -                                     N1.getOperand(2).getOperand(0), -                                     N1.getOperand(2).getOperand(1), -                                     N0, Flags), Flags); -    } - -      // fold (fadd (fma x, y, (fpext (fmul u, v))), z)      //   -> (fma x, y, (fma (fpext u), (fpext v), z))      auto FoldFAddFMAFPExtFMul = [&] ( @@ -11736,7 +12194,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    // Floating-point multiply-add with intermediate rounding. -  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N)); +  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));    // Floating-point multiply-add without intermediate rounding.    bool HasFMA = @@ -11756,13 +12214,13 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    if (!AllowFusionGlobally && !isContractable(N))      return SDValue(); -  const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();    if (STI && STI->generateFMAsInMachineCombiner(OptLevel))      return SDValue();    // Always prefer FMAD to FMA for precision.    unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;    bool Aggressive = TLI.enableAggressiveFMAFusion(VT); +  bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();    // Is the node an FMUL and contractable either due to global flags or    // SDNodeFlags. @@ -11773,19 +12231,43 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {    };    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { -    return DAG.getNode(PreferredFusedOpcode, SL, VT, -                       N0.getOperand(0), N0.getOperand(1), -                       DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); -  } +  auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { +    if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { +      return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), +                         XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z), +                         Flags); +    } +    return SDValue(); +  };    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)    // Note: Commutes FSUB operands. 
-  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { -    return DAG.getNode(PreferredFusedOpcode, SL, VT, -                       DAG.getNode(ISD::FNEG, SL, VT, -                                   N1.getOperand(0)), -                       N1.getOperand(1), N0, Flags); +  auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { +    if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { +      return DAG.getNode(PreferredFusedOpcode, SL, VT, +                         DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), +                         YZ.getOperand(1), X, Flags); +    } +    return SDValue(); +  }; + +  // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)), +  // prefer to fold the multiply with fewer uses. +  if (isContractableFMUL(N0) && isContractableFMUL(N1) && +      (N0.getNode()->use_size() > N1.getNode()->use_size())) { +    // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b)) +    if (SDValue V = tryToFoldXSubYZ(N0, N1)) +      return V; +    // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d))) +    if (SDValue V = tryToFoldXYSubZ(N0, N1)) +      return V; +  } else { +    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +    if (SDValue V = tryToFoldXYSubZ(N0, N1)) +      return V; +    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +    if (SDValue V = tryToFoldXSubYZ(N0, N1)) +      return V;    }    // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) @@ -11902,7 +12384,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {      //   -> (fma (fneg y), z, (fma (fneg u), v, x))      if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&          isContractableFMUL(N1.getOperand(2)) && -        N1->hasOneUse()) { +        N1->hasOneUse() && NoSignedZero) {        SDValue N20 = N1.getOperand(2).getOperand(0);        SDValue N21 = N1.getOperand(2).getOperand(1);        return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -12055,7 +12537,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {    // Floating-point multiply-add with intermediate rounding. This can result    // in a less precise result due to the changed rounding order.    bool HasFMAD = Options.UnsafeFPMath && -                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); +                 (LegalOperations && TLI.isFMADLegal(DAG, N));    // No valid opcode, do not combine.    
if (!HasFMAD && !HasFMA) @@ -12132,6 +12614,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    const SDNodeFlags Flags = N->getFlags(); +  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) +    return R; +    // fold vector ops    if (VT.isVector())      if (SDValue FoldedVOp = SimplifyVBinOp(N)) @@ -12155,18 +12640,16 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {      return NewSel;    // fold (fadd A, (fneg B)) -> (fsub A, B) -  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && -      TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize) == 2) -    return DAG.getNode( -        ISD::FSUB, DL, VT, N0, -        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); +  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) +    if (SDValue NegN1 = TLI.getCheaperNegatedExpression( +            N1, DAG, LegalOperations, ForCodeSize)) +      return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags);    // fold (fadd (fneg A), B) -> (fsub B, A) -  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && -      TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize) == 2) -    return DAG.getNode( -        ISD::FSUB, DL, VT, N1, -        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags); +  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) +    if (SDValue NegN0 = TLI.getCheaperNegatedExpression( +            N0, DAG, LegalOperations, ForCodeSize)) +      return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags);    auto isFMulNegTwo = [](SDValue FMul) {      if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) @@ -12311,6 +12794,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    const SDNodeFlags Flags = N->getFlags(); +  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) +    return R; +    // fold vector ops    if (VT.isVector())      if (SDValue FoldedVOp = SimplifyVBinOp(N)) @@ -12345,8 +12831,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {    if (N0CFP && N0CFP->isZero()) {      if (N0CFP->isNegative() ||          (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { -      if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) -        return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); +      if (SDValue NegN1 = +              TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) +        return NegN1;        if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))          return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);      } @@ -12364,10 +12851,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {    }    // fold (fsub A, (fneg B)) -> (fadd A, B) -  if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) -    return DAG.getNode( -        ISD::FADD, DL, VT, N0, -        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); +  if (SDValue NegN1 = +          TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) +    return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags);    // FSUB -> FMA combines:    if (SDValue Fused = visitFSUBForFMACombine(N)) { @@ -12378,21 +12864,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {    return SDValue();  } -/// Return true if both inputs are at least as cheap in negated form and at -/// least one input is strictly cheaper in negated form. 
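As an aside (not from the LLVM sources): the visitFADD/visitFSUB hunks above keep turning fadd(A, fneg(B)) into fsub(A, B) and fsub(A, fneg(B)) into fadd(A, B) when the negated operand is the cheaper form; IEEE-754 negation is an exact sign flip, so these rewrites are value-preserving. A quick check over a few finite values (the list is illustrative):

    #include <cassert>

    int main() {
      const double Vals[] = {0.0, -0.0, 1.5, -2.25, 1e308, -1e-308, 3.14159};
      for (double A : Vals)
        for (double B : Vals) {
          // fold (fadd A, (fneg B)) -> (fsub A, B)
          assert(A + (-B) == A - B);
          // fold (fsub A, (fneg B)) -> (fadd A, B)
          assert(A - (-B) == A + B);
        }
      return 0;
    }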
-bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) { -  if (char LHSNeg = -          TLI.isNegatibleForFree(X, DAG, LegalOperations, ForCodeSize)) -    if (char RHSNeg = -            TLI.isNegatibleForFree(Y, DAG, LegalOperations, ForCodeSize)) -      // Both negated operands are at least as cheap as their counterparts. -      // Check to see if at least one is cheaper negated. -      if (LHSNeg == 2 || RHSNeg == 2) -        return true; - -  return false; -} -  SDValue DAGCombiner::visitFMUL(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1); @@ -12403,6 +12874,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    const SDNodeFlags Flags = N->getFlags(); +  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) +    return R; +    // fold vector ops    if (VT.isVector()) {      // This just handles C1 * C2 for vectors. Other vector folds are below. @@ -12464,13 +12938,18 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {        return DAG.getNode(ISD::FNEG, DL, VT, N0);    // -N0 * -N1 --> N0 * N1 -  if (isCheaperToUseNegatedFPOps(N0, N1)) { -    SDValue NegN0 = -        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); -    SDValue NegN1 = -        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); +  TargetLowering::NegatibleCost CostN0 = +      TargetLowering::NegatibleCost::Expensive; +  TargetLowering::NegatibleCost CostN1 = +      TargetLowering::NegatibleCost::Expensive; +  SDValue NegN0 = +      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); +  SDValue NegN1 = +      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); +  if (NegN0 && NegN1 && +      (CostN0 == TargetLowering::NegatibleCost::Cheaper || +       CostN1 == TargetLowering::NegatibleCost::Cheaper))      return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags); -  }    // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))    // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) @@ -12549,13 +13028,18 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {    }    // (-N0 * -N1) + N2 --> (N0 * N1) + N2 -  if (isCheaperToUseNegatedFPOps(N0, N1)) { -    SDValue NegN0 = -        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); -    SDValue NegN1 = -        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); +  TargetLowering::NegatibleCost CostN0 = +      TargetLowering::NegatibleCost::Expensive; +  TargetLowering::NegatibleCost CostN1 = +      TargetLowering::NegatibleCost::Expensive; +  SDValue NegN0 = +      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); +  SDValue NegN1 = +      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); +  if (NegN0 && NegN1 && +      (CostN0 == TargetLowering::NegatibleCost::Cheaper || +       CostN1 == TargetLowering::NegatibleCost::Cheaper))      return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags); -  }    if (UnsafeFPMath) {      if (N0CFP && N0CFP->isZero()) @@ -12641,13 +13125,10 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {    // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))    // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z)) -  if (!TLI.isFNegFree(VT) && -      TLI.isNegatibleForFree(SDValue(N, 0), DAG, LegalOperations, -                             ForCodeSize) == 2) -    return DAG.getNode(ISD::FNEG, DL, VT, -                       TLI.getNegatedExpression(SDValue(N, 0), 
DAG, -                                                LegalOperations, ForCodeSize), -                       Flags); +  if (!TLI.isFNegFree(VT)) +    if (SDValue Neg = TLI.getCheaperNegatedExpression( +            SDValue(N, 0), DAG, LegalOperations, ForCodeSize)) +      return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags);    return SDValue();  } @@ -12664,7 +13145,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {    //       that only minsize should restrict this.    bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;    const SDNodeFlags Flags = N->getFlags(); -  if (!UnsafeMath && !Flags.hasAllowReciprocal()) +  if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))      return SDValue();    // Skip if current node is a reciprocal/fneg-reciprocal. @@ -12735,6 +13216,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {    const TargetOptions &Options = DAG.getTarget().Options;    SDNodeFlags Flags = N->getFlags(); +  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) +    return R; +    // fold vector ops    if (VT.isVector())      if (SDValue FoldedVOp = SimplifyVBinOp(N)) @@ -12794,37 +13278,62 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {      } else if (N1.getOpcode() == ISD::FMUL) {        // Look through an FMUL. Even though this won't remove the FDIV directly,        // it's still worthwhile to get rid of the FSQRT if possible. -      SDValue SqrtOp; -      SDValue OtherOp; +      SDValue Sqrt, Y;        if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { -        SqrtOp = N1.getOperand(0); -        OtherOp = N1.getOperand(1); +        Sqrt = N1.getOperand(0); +        Y = N1.getOperand(1);        } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { -        SqrtOp = N1.getOperand(1); -        OtherOp = N1.getOperand(0); +        Sqrt = N1.getOperand(1); +        Y = N1.getOperand(0);        } -      if (SqrtOp.getNode()) { +      if (Sqrt.getNode()) { +        // If the other multiply operand is known positive, pull it into the +        // sqrt. That will eliminate the division if we convert to an estimate: +        // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z) +        // TODO: Also fold the case where A == Z (fabs is missing). +        if (Flags.hasAllowReassociation() && N1.hasOneUse() && +            N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() && +            Y.getOpcode() == ISD::FABS && Y.hasOneUse()) { +          SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0), +                                   Y.getOperand(0), Flags); +          SDValue AAZ = +              DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags); +          if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags)) +            return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags); + +          // Estimate creation failed. Clean up speculatively created nodes. 
+          recursivelyDeleteUnusedNodes(AAZ.getNode()); +        } +          // We found a FSQRT, so try to make this fold: -        // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) -        if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { -          RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); -          AddToWorklist(RV.getNode()); -          return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); +        // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y) +        if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) { +          SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags); +          AddToWorklist(Div.getNode()); +          return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags);          }        }      }      // Fold into a reciprocal estimate and multiply instead of a real divide. -    if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) -      return RV; +    if (Options.NoInfsFPMath || Flags.hasNoInfs()) +      if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) +        return RV;    }    // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) -  if (isCheaperToUseNegatedFPOps(N0, N1)) -    return DAG.getNode( -        ISD::FDIV, SDLoc(N), VT, -        TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), -        TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); +  TargetLowering::NegatibleCost CostN0 = +      TargetLowering::NegatibleCost::Expensive; +  TargetLowering::NegatibleCost CostN1 = +      TargetLowering::NegatibleCost::Expensive; +  SDValue NegN0 = +      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0); +  SDValue NegN1 = +      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); +  if (NegN0 && NegN1 && +      (CostN0 == TargetLowering::NegatibleCost::Cheaper || +       CostN1 == TargetLowering::NegatibleCost::Cheaper)) +    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);    return SDValue();  } @@ -12835,6 +13344,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);    EVT VT = N->getValueType(0); +  SDNodeFlags Flags = N->getFlags(); + +  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) +    return R;    // fold (frem c1, c2) -> fmod(c1,c2)    if (N0CFP && N1CFP) @@ -12848,8 +13361,12 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {  SDValue DAGCombiner::visitFSQRT(SDNode *N) {    SDNodeFlags Flags = N->getFlags(); -  if (!DAG.getTarget().Options.UnsafeFPMath && -      !Flags.hasApproximateFuncs()) +  const TargetOptions &Options = DAG.getTarget().Options; + +  // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as: +  // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN +  if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) || +      (!Options.NoInfsFPMath && !Flags.hasNoInfs()))      return SDValue();    SDValue N0 = N->getOperand(0); @@ -13061,33 +13578,24 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {    }    // The next optimizations are desirable only if SELECT_CC can be lowered. 
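As an aside (not from the LLVM sources): the new visitFDIV case above rewrites X / (fabs(A) * sqrt(Z)) as X * rsqrt(A*A*Z), which relies on the algebraic identity |a| * sqrt(z) = sqrt(a*a*z) for z >= 0. In floating point the two sides agree only up to rounding, which is why the fold is guarded by reassociation flags; a quick numeric check with a tolerance and illustrative inputs:

    #include <cassert>
    #include <cmath>

    int main() {
      const double As[] = {-3.5, 0.25, 7.0, -123.75};
      const double Zs[] = {0.5, 2.0, 1e6, 9.0};
      for (double A : As)
        for (double Z : Zs) {
          double Lhs = std::fabs(A) * std::sqrt(Z); // the original denominator
          double Rhs = std::sqrt(A * A * Z);        // what the fold feeds to rsqrt
          assert(std::fabs(Lhs - Rhs) <= 1e-12 * Lhs);
        }
      return 0;
    }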
-  if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { -    // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) -    if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && -        !VT.isVector() && -        (!LegalOperations || -         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { -      SDLoc DL(N); -      SDValue Ops[] = -        { N0.getOperand(0), N0.getOperand(1), -          DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), -          N0.getOperand(2) }; -      return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); -    } +  // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0) +  if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && +      !VT.isVector() && +      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { +    SDLoc DL(N); +    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT), +                         DAG.getConstantFP(0.0, DL, VT)); +  } -    // fold (sint_to_fp (zext (setcc x, y, cc))) -> -    //      (select_cc x, y, 1.0, 0.0,, cc) -    if (N0.getOpcode() == ISD::ZERO_EXTEND && -        N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && -        (!LegalOperations || -         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { -      SDLoc DL(N); -      SDValue Ops[] = -        { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), -          DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), -          N0.getOperand(0).getOperand(2) }; -      return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); -    } +  // fold (sint_to_fp (zext (setcc x, y, cc))) -> +  //      (select (setcc x, y, cc), 1.0, 0.0) +  if (N0.getOpcode() == ISD::ZERO_EXTEND && +      N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() && +      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { +    SDLoc DL(N); +    return DAG.getSelect(DL, VT, N0.getOperand(0), +                         DAG.getConstantFP(1.0, DL, VT), +                         DAG.getConstantFP(0.0, DL, VT));    }    if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) @@ -13121,19 +13629,12 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {        return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);    } -  // The next optimizations are desirable only if SELECT_CC can be lowered. 
-  if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { -    // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) -    if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && -        (!LegalOperations || -         TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { -      SDLoc DL(N); -      SDValue Ops[] = -        { N0.getOperand(0), N0.getOperand(1), -          DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), -          N0.getOperand(2) }; -      return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); -    } +  // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0) +  if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && +      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { +    SDLoc DL(N); +    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT), +                         DAG.getConstantFP(0.0, DL, VT));    }    if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) @@ -13378,12 +13879,14 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {    if (isConstantFPBuildVectorOrConstantFP(N0))      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); -  if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize)) -    return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); +  if (SDValue NegN0 = +          TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize)) +    return NegN0; -  // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 FIXME: This is -  // duplicated in isNegatibleForFree, but isNegatibleForFree doesn't know it -  // was called from a context with a nsz flag if the input fsub does not. +  // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 +  // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't +  // know it was called from a context with a nsz flag if the input fsub does +  // not.    if (N0.getOpcode() == ISD::FSUB &&        (DAG.getTarget().Options.NoSignedZerosFPMath ||         N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) { @@ -13539,8 +14042,12 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {    }    if (N1.hasOneUse()) { +    // rebuildSetCC calls visitXor which may change the Chain when there is a +    // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes. +    HandleSDNode ChainHandle(Chain);      if (SDValue NewN1 = rebuildSetCC(N1)) -      return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, NewN1, N2); +      return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, +                         ChainHandle.getValue(), NewN1, N2);    }    return SDValue(); @@ -13592,8 +14099,8 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {      }    } -  // Transform br(xor(x, y)) -> br(x != y) -  // Transform br(xor(xor(x,y), 1)) -> br (x == y) +  // Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne)) +  // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))    if (N.getOpcode() == ISD::XOR) {      // Because we may call this on a speculatively constructed      // SimplifiedSetCC Node, we need to simplify this node first. 
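As an aside (not from the LLVM sources): the sint_to_fp/uint_to_fp folds above use the fact that an i1 boolean converts to -1.0 when read as a signed value (a 1-bit signed true is -1) and to 1.0 when read as an unsigned value, so converting a setcc collapses to a select between floating-point constants. A scalar model:

    #include <cassert>

    int main() {
      for (int X = -2; X <= 2; ++X)
        for (int Y = -2; Y <= 2; ++Y) {
          bool Cond = X < Y;                  // the setcc
          int SignedI1 = Cond ? -1 : 0;       // i1 true read as a signed value
          unsigned UnsignedI1 = Cond ? 1 : 0; // i1 true read as an unsigned value

          // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
          assert((double)SignedI1 == (Cond ? -1.0 : 0.0));
          // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
          assert((double)UnsignedI1 == (Cond ? 1.0 : 0.0));
        }
      return 0;
    }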
@@ -13617,16 +14124,17 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {      if (N.getOpcode() != ISD::XOR)        return N; -    SDNode *TheXor = N.getNode(); - -    SDValue Op0 = TheXor->getOperand(0); -    SDValue Op1 = TheXor->getOperand(1); +    SDValue Op0 = N->getOperand(0); +    SDValue Op1 = N->getOperand(1);      if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {        bool Equal = false; -      if (isOneConstant(Op0) && Op0.hasOneUse() && -          Op0.getOpcode() == ISD::XOR) { -        TheXor = Op0.getNode(); +      // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq)) +      if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR && +          Op0.getValueType() == MVT::i1) { +        N = Op0; +        Op0 = N->getOperand(0); +        Op1 = N->getOperand(1);          Equal = true;        } @@ -13634,7 +14142,7 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {        if (LegalTypes)          SetCCVT = getSetCCResultType(SetCCVT);        // Replace the uses of XOR with SETCC -      return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1, +      return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,                            Equal ? ISD::SETEQ : ISD::SETNE);      }    } @@ -13994,118 +14502,142 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {    return true;  } -/// Try to combine a load/store with a add/sub of the base pointer node into a -/// post-indexed load/store. The transformation folded the add/subtract into the -/// new indexed load/store effectively and all of its uses are redirected to the -/// new load/store. -bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { -  if (Level < AfterLegalizeDAG) +static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, +                                   SDValue &BasePtr, SDValue &Offset, +                                   ISD::MemIndexedMode &AM, +                                   SelectionDAG &DAG, +                                   const TargetLowering &TLI) { +  if (PtrUse == N || +      (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))      return false; -  bool IsLoad = true; -  bool IsMasked = false; -  SDValue Ptr; -  if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, -                                Ptr, TLI)) +  if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))      return false; -  if (Ptr.getNode()->hasOneUse()) +  // Don't create a indexed load / store with zero offset. +  if (isNullConstant(Offset))      return false; -  for (SDNode *Op : Ptr.getNode()->uses()) { -    if (Op == N || -        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) -      continue; +  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr)) +    return false; -    SDValue BasePtr; -    SDValue Offset; -    ISD::MemIndexedMode AM = ISD::UNINDEXED; -    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { -      // Don't create a indexed load / store with zero offset. -      if (isNullConstant(Offset)) -        continue; +  SmallPtrSet<const SDNode *, 32> Visited; +  for (SDNode *Use : BasePtr.getNode()->uses()) { +    if (Use == Ptr.getNode()) +      continue; -      // Try turning it into a post-indexed load / store except when -      // 1) All uses are load / store ops that use it as base ptr (and -      //    it may be folded as addressing mmode). -      // 2) Op must be independent of N, i.e. 
Op is neither a predecessor -      //    nor a successor of N. Otherwise, if Op is folded that would -      //    create a cycle. +    // No if there's a later user which could perform the index instead. +    if (isa<MemSDNode>(Use)) { +      bool IsLoad = true; +      bool IsMasked = false; +      SDValue OtherPtr; +      if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad, +                                   IsMasked, OtherPtr, TLI)) { +        SmallVector<const SDNode *, 2> Worklist; +        Worklist.push_back(Use); +        if (SDNode::hasPredecessorHelper(N, Visited, Worklist)) +          return false; +      } +    } -      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr)) -        continue; +    // If all the uses are load / store addresses, then don't do the +    // transformation. +    if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { +      for (SDNode *UseUse : Use->uses()) +        if (canFoldInAddressingMode(Use, UseUse, DAG, TLI)) +          return false; +    } +  } +  return true; +} -      // Check for #1. -      bool TryNext = false; -      for (SDNode *Use : BasePtr.getNode()->uses()) { -        if (Use == Ptr.getNode()) -          continue; +static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, +                                         bool &IsMasked, SDValue &Ptr, +                                         SDValue &BasePtr, SDValue &Offset, +                                         ISD::MemIndexedMode &AM, +                                         SelectionDAG &DAG, +                                         const TargetLowering &TLI) { +  if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, +                                IsMasked, Ptr, TLI) || +      Ptr.getNode()->hasOneUse()) +    return nullptr; + +  // Try turning it into a post-indexed load / store except when +  // 1) All uses are load / store ops that use it as base ptr (and +  //    it may be folded as addressing mmode). +  // 2) Op must be independent of N, i.e. Op is neither a predecessor +  //    nor a successor of N. Otherwise, if Op is folded that would +  //    create a cycle. +  for (SDNode *Op : Ptr->uses()) { +    // Check for #1. +    if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) +      continue; -        // If all the uses are load / store addresses, then don't do the -        // transformation. -        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { -          bool RealUse = false; -          for (SDNode *UseUse : Use->uses()) { -            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) -              RealUse = true; -          } +    // Check for #2. +    SmallPtrSet<const SDNode *, 32> Visited; +    SmallVector<const SDNode *, 8> Worklist; +    // Ptr is predecessor to both N and Op. +    Visited.insert(Ptr.getNode()); +    Worklist.push_back(N); +    Worklist.push_back(Op); +    if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && +        !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) +      return Op; +  } +  return nullptr; +} -          if (!RealUse) { -            TryNext = true; -            break; -          } -        } -      } +/// Try to combine a load/store with a add/sub of the base pointer node into a +/// post-indexed load/store. The transformation folded the add/subtract into the +/// new indexed load/store effectively and all of its uses are redirected to the +/// new load/store. 
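The restructured CombineToPostIndexedLoadStore below still performs the same transformation as before the refactor: a load or store followed by an add/sub of its base pointer becomes a single post-indexed access that yields both the memory result and the updated pointer. A hypothetical scalar model of that fusion (the helper name is made up for illustration; it is not the SelectionDAG API):

#include <cassert>
#include <cstdint>

// Hypothetical model of a post-incremented load: return the loaded value and
// advance the base pointer in one step, as a post-indexed instruction would.
static uint32_t loadPostInc(const uint32_t *&P, int64_t OffsetInElems) {
  uint32_t V = *P;        // the original load
  P += OffsetInElems;     // the add of the base pointer, now folded in
  return V;
}

int main() {
  uint32_t Buf[4] = {10, 20, 30, 40};

  // Unfused form: separate load and pointer add.
  const uint32_t *P1 = Buf;
  uint32_t A = *P1;
  P1 += 1;

  // Fused (post-indexed) form.
  const uint32_t *P2 = Buf;
  uint32_t B = loadPostInc(P2, 1);

  assert(A == B && P1 == P2);
  return 0;
}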
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { +  if (Level < AfterLegalizeDAG) +    return false; -      if (TryNext) -        continue; +  bool IsLoad = true; +  bool IsMasked = false; +  SDValue Ptr; +  SDValue BasePtr; +  SDValue Offset; +  ISD::MemIndexedMode AM = ISD::UNINDEXED; +  SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr, +                                         Offset, AM, DAG, TLI); +  if (!Op) +    return false; -      // Check for #2. -      SmallPtrSet<const SDNode *, 32> Visited; -      SmallVector<const SDNode *, 8> Worklist; -      // Ptr is predecessor to both N and Op. -      Visited.insert(Ptr.getNode()); -      Worklist.push_back(N); -      Worklist.push_back(Op); -      if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && -          !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) { -        SDValue Result; -        if (!IsMasked) -          Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, -                                               Offset, AM) -                          : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), +  SDValue Result; +  if (!IsMasked) +    Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, +                                         Offset, AM) +                    : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), +                                          BasePtr, Offset, AM); +  else +    Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), +                                               BasePtr, Offset, AM) +                    : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),                                                  BasePtr, Offset, AM); -        else -          Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), -                                                     BasePtr, Offset, AM) -                          : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), -                                                      BasePtr, Offset, AM); -        ++PostIndexedNodes; -        ++NodesCombined; -        LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); -                   dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); -                   dbgs() << '\n'); -        WorklistRemover DeadNodes(*this); -        if (IsLoad) { -          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); -          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); -        } else { -          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); -        } - -        // Finally, since the node is now dead, remove it from the graph. -        deleteAndRecombine(N); - -        // Replace the uses of Use with uses of the updated base value. -        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), -                                      Result.getValue(IsLoad ? 
1 : 0)); -        deleteAndRecombine(Op); -        return true; -      } -    } +  ++PostIndexedNodes; +  ++NodesCombined; +  LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); +             dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); +             dbgs() << '\n'); +  WorklistRemover DeadNodes(*this); +  if (IsLoad) { +    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); +    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); +  } else { +    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));    } -  return false; +  // Finally, since the node is now dead, remove it from the graph. +  deleteAndRecombine(N); + +  // Replace the uses of Use with uses of the updated base value. +  DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), +                                Result.getValue(IsLoad ? 1 : 0)); +  deleteAndRecombine(Op); +  return true;  }  /// Return the base-pointer arithmetic from an indexed \p LD. @@ -14222,11 +14754,11 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {    auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {      if (LD->isIndexed()) { -      bool IsSub = (LD->getAddressingMode() == ISD::PRE_DEC || -                    LD->getAddressingMode() == ISD::POST_DEC); -      unsigned Opc = IsSub ? ISD::SUB : ISD::ADD; -      SDValue Idx = DAG.getNode(Opc, SDLoc(LD), LD->getOperand(1).getValueType(), -                             LD->getOperand(1), LD->getOperand(2)); +      // Cannot handle opaque target constants and we must respect the user's +      // request not to split indexes from loads. +      if (!canSplitIdx(LD)) +        return SDValue(); +      SDValue Idx = SplitIndexingFromLoad(LD);        SDValue Ops[] = {Val, Idx, Chain};        return CombineTo(LD, Ops, 3);      } @@ -14322,14 +14854,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {        // the indexing into an add/sub directly (that TargetConstant may not be        // valid for a different type of node, and we cannot convert an opaque        // target constant into a regular constant). -      bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant && -                       cast<ConstantSDNode>(LD->getOperand(2))->isOpaque(); +      bool CanSplitIdx = canSplitIdx(LD); -      if (!N->hasAnyUseOfValue(0) && -          ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) { +      if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {          SDValue Undef = DAG.getUNDEF(N->getValueType(0));          SDValue Index; -        if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) { +        if (N->hasAnyUseOfValue(1) && CanSplitIdx) {            Index = SplitIndexingFromLoad(LD);            // Try to fold the base pointer arithmetic into subsequent loads and            // stores. @@ -14356,11 +14886,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {    // Try to infer better alignment information than the load already has.    
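The comment above introduces refining a load's alignment from what is known about its pointer. The arithmetic behind it: a pointer formed from a BaseAlign-aligned base plus a constant offset is aligned to the largest power of two dividing both. A standalone stand-in for that computation (illustrative; not the LLVM MinAlign/commonAlignment API itself):

#include <cassert>
#include <cstdint>

// Largest power-of-two alignment still guaranteed after adding a constant
// offset to a BaseAlign-aligned pointer. BaseAlign is a power of two.
static uint64_t knownAlignment(uint64_t BaseAlign, uint64_t Offset) {
  uint64_t A = BaseAlign;
  while (Offset % A != 0)
    A /= 2;
  return A;
}

int main() {
  assert(knownAlignment(16, 0) == 16); // no offset keeps the base alignment
  assert(knownAlignment(16, 4) == 4);  // +4 from a 16-aligned base is 4-aligned
  assert(knownAlignment(16, 6) == 2);  // +6 only guarantees 2-byte alignment
  return 0;
}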
if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) { -    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { -      if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) { +    if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { +      if (*Alignment > LD->getAlign() && +          isAligned(*Alignment, LD->getSrcValueOffset())) {          SDValue NewLoad = DAG.getExtLoad(              LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, -            LD->getPointerInfo(), LD->getMemoryVT(), Align, +            LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,              LD->getMemOperand()->getFlags(), LD->getAAInfo());          // NewLoad will always be N as we are only refining the alignment          assert(NewLoad.getNode() == N); @@ -14557,11 +15088,11 @@ struct LoadedSlice {    }    /// Get the alignment of the load used for this slice. -  unsigned getAlignment() const { -    unsigned Alignment = Origin->getAlignment(); +  Align getAlign() const { +    Align Alignment = Origin->getAlign();      uint64_t Offset = getOffsetFromBase();      if (Offset != 0) -      Alignment = MinAlign(Alignment, Alignment + Offset); +      Alignment = commonAlignment(Alignment, Alignment.value() + Offset);      return Alignment;    } @@ -14657,8 +15188,8 @@ struct LoadedSlice {      // Create the load for the slice.      SDValue LastInst =          DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, -                     Origin->getPointerInfo().getWithOffset(Offset), -                     getAlignment(), Origin->getMemOperand()->getFlags()); +                     Origin->getPointerInfo().getWithOffset(Offset), getAlign(), +                     Origin->getMemOperand()->getFlags());      // If the final type is not the same as the loaded type, this means that      // we have to pad with zero. Create a zero extend for that.      EVT FinalType = Inst->getValueType(0); @@ -14699,10 +15230,10 @@ struct LoadedSlice {      // Check if it will be merged with the load.      // 1. Check the alignment constraint. -    unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment( +    Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign(          ResVT.getTypeForEVT(*DAG->getContext())); -    if (RequiredAlignment > getAlignment()) +    if (RequiredAlignment > getAlign())        return false;      // 2. Check that the load is a legal operation for that type. @@ -14788,14 +15319,14 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,        continue;      // Check if the target supplies paired loads for this type. -    unsigned RequiredAlignment = 0; +    Align RequiredAlignment;      if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {        // move to the next pair, this type is hopeless.        Second = nullptr;        continue;      }      // Check if we meet the alignment requirement. -    if (RequiredAlignment > First->getAlignment()) +    if (First->getAlign() < RequiredAlignment)        continue;      // Check that both loads are next to each other in memory. @@ -14868,6 +15399,12 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {        !LD->getValueType(0).isInteger())      return false; +  // The algorithm to split up a load of a scalable vector into individual +  // elements currently requires knowing the length of the loaded type, +  // so will need adjusting to work on scalable vectors. 
+  if (LD->getValueType(0).isScalableVector()) +    return false; +    // Keep track of already used bits to detect overlapping values.    // In that case, we will just abort the transformation.    APInt UsedBits(LD->getValueSizeInBits(0), 0); @@ -15112,7 +15649,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {    // Y is known to provide just those bytes.  If so, we try to replace the    // load + replace + store sequence with a single (narrower) store, which makes    // the load dead. -  if (Opc == ISD::OR) { +  if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {      std::pair<unsigned, unsigned> MaskedLoad;      MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);      if (MaskedLoad.first) @@ -15128,6 +15665,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {          return NewST;    } +  if (!EnableReduceLoadOpStoreWidth) +    return SDValue(); +    if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||        Value.getOperand(1).getOpcode() != ISD::Constant)      return SDValue(); @@ -15181,9 +15721,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {        if (DAG.getDataLayout().isBigEndian())          PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; -      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff); +      Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);        Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext()); -      if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy)) +      if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))          return SDValue();        SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD)); @@ -15229,17 +15769,24 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {          ST->getPointerInfo().getAddrSpace() != 0)        return SDValue(); -    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); +    TypeSize VTSize = VT.getSizeInBits(); + +    // We don't know the size of scalable types at compile time so we cannot +    // create an integer of the equivalent size. +    if (VTSize.isScalable()) +      return SDValue(); + +    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());      if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||          !TLI.isOperationLegal(ISD::STORE, IntVT) ||          !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||          !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))        return SDValue(); -    unsigned LDAlign = LD->getAlignment(); -    unsigned STAlign = ST->getAlignment(); +    Align LDAlign = LD->getAlign(); +    Align STAlign = ST->getAlign();      Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); -    unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy); +    Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy);      if (LDAlign < ABIAlign || STAlign < ABIAlign)        return SDValue(); @@ -15356,7 +15903,7 @@ SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,    return DAG.getTokenFactor(StoreDL, Chains);  } -bool DAGCombiner::MergeStoresOfConstantsOrVecElts( +bool DAGCombiner::mergeStoresOfConstantsOrVecElts(      SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,      bool IsConstantSrc, bool UseVector, bool UseTrunc) {    // Make sure we have something to merge. 
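ReduceLoadOpStoreWidth, now gated by the combiner-reduce-load-op-store-width flag added above, narrows a load/op/store when the constant operand only touches a narrow slice of the value. The observation it relies on, checked here in a standalone, endianness-neutral way (illustrative sketch, not the combiner code):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // If the OR'd (or AND'd/XOR'd) constant only has bits set inside one byte
  // of the value, only that byte of the in-memory representation changes, so
  // the full-width load/op/store can become a narrower one.
  uint32_t Before = 0x11223344u;
  uint32_t After = Before | 0x00AB0000u; // constant touches bits 16..23 only

  unsigned char RawBefore[4], RawAfter[4];
  std::memcpy(RawBefore, &Before, 4);
  std::memcpy(RawAfter, &After, 4);

  unsigned Changed = 0;
  for (unsigned i = 0; i < 4; ++i)
    Changed += (RawBefore[i] != RawAfter[i]);
  assert(Changed == 1); // exactly one byte differs, whatever the endianness
  return 0;
}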
@@ -15530,14 +16077,12 @@ void DAGCombiner::getStoreMergeCandidates(    if (BasePtr.getBase().isUndef())      return; -  bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val); -  bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || -                          Val.getOpcode() == ISD::EXTRACT_SUBVECTOR); -  bool IsLoadSrc = isa<LoadSDNode>(Val); +  StoreSource StoreSrc = getStoreSource(Val); +  assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");    BaseIndexOffset LBasePtr;    // Match on loadbaseptr if relevant.    EVT LoadVT; -  if (IsLoadSrc) { +  if (StoreSrc == StoreSource::Load) {      auto *Ld = cast<LoadSDNode>(Val);      LBasePtr = BaseIndexOffset::match(Ld, DAG);      LoadVT = Ld->getMemoryVT(); @@ -15565,7 +16110,7 @@ void DAGCombiner::getStoreMergeCandidates(      // Allow merging constants of different types as integers.      bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())                                             : Other->getMemoryVT() != MemVT; -    if (IsLoadSrc) { +    if (StoreSrc == StoreSource::Load) {        if (NoTypeMatch)          return false;        // The Load's Base Ptr must also match @@ -15589,13 +16134,13 @@ void DAGCombiner::getStoreMergeCandidates(        } else          return false;      } -    if (IsConstantSrc) { +    if (StoreSrc == StoreSource::Constant) {        if (NoTypeMatch)          return false;        if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))          return false;      } -    if (IsExtractVecSrc) { +    if (StoreSrc == StoreSource::Extract) {        // Do not merge truncated stores here.        if (Other->isTruncatingStore())          return false; @@ -15736,77 +16281,22 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(    return true;  } -bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { -  if (OptLevel == CodeGenOpt::None || !EnableStoreMerging) -    return false; - -  EVT MemVT = St->getMemoryVT(); -  int64_t ElementSizeBytes = MemVT.getStoreSize(); -  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; - -  if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) -    return false; - -  bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute( -      Attribute::NoImplicitFloat); - -  // This function cannot currently deal with non-byte-sized memory sizes. -  if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits()) -    return false; - -  if (!MemVT.isSimple()) -    return false; - -  // Perform an early exit check. Do not bother looking at stored values that -  // are not constants, loads, or extracted vector elements. -  SDValue StoredVal = peekThroughBitcasts(St->getValue()); -  bool IsLoadSrc = isa<LoadSDNode>(StoredVal); -  bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) || -                       isa<ConstantFPSDNode>(StoredVal); -  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || -                          StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); -  bool IsNonTemporalStore = St->isNonTemporal(); -  bool IsNonTemporalLoad = -      IsLoadSrc && cast<LoadSDNode>(StoredVal)->isNonTemporal(); - -  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) -    return false; - -  SmallVector<MemOpLink, 8> StoreNodes; -  SDNode *RootNode; -  // Find potential store merge candidates by searching through chain sub-DAG -  getStoreMergeCandidates(St, StoreNodes, RootNode); - -  // Check if there is anything to merge. 
-  if (StoreNodes.size() < 2) -    return false; - -  // Sort the memory operands according to their distance from the -  // base pointer. -  llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) { -    return LHS.OffsetFromBase < RHS.OffsetFromBase; -  }); - -  // Store Merge attempts to merge the lowest stores. This generally -  // works out as if successful, as the remaining stores are checked -  // after the first collection of stores is merged. However, in the -  // case that a non-mergeable store is found first, e.g., {p[-2], -  // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent -  // mergeable cases. To prevent this, we prune such stores from the -  // front of StoreNodes here. - -  bool RV = false; -  while (StoreNodes.size() > 1) { +unsigned +DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes, +                                  int64_t ElementSizeBytes) const { +  while (true) { +    // Find a store past the width of the first store.      size_t StartIdx = 0;      while ((StartIdx + 1 < StoreNodes.size()) &&             StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != -               StoreNodes[StartIdx + 1].OffsetFromBase) +              StoreNodes[StartIdx + 1].OffsetFromBase)        ++StartIdx;      // Bail if we don't have enough candidates to merge.      if (StartIdx + 1 >= StoreNodes.size()) -      return RV; +      return 0; +    // Trim stores that overlapped with the first store.      if (StartIdx)        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); @@ -15822,302 +16312,345 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {          break;        NumConsecutiveStores = i + 1;      } +    if (NumConsecutiveStores > 1) +      return NumConsecutiveStores; -    if (NumConsecutiveStores < 2) { -      StoreNodes.erase(StoreNodes.begin(), -                       StoreNodes.begin() + NumConsecutiveStores); -      continue; -    } - -    // The node with the lowest store address. -    LLVMContext &Context = *DAG.getContext(); -    const DataLayout &DL = DAG.getDataLayout(); - -    // Store the constants into memory as one consecutive store. -    if (IsConstantSrc) { -      while (NumConsecutiveStores >= 2) { -        LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; -        unsigned FirstStoreAS = FirstInChain->getAddressSpace(); -        unsigned FirstStoreAlign = FirstInChain->getAlignment(); -        unsigned LastLegalType = 1; -        unsigned LastLegalVectorType = 1; -        bool LastIntegerTrunc = false; -        bool NonZero = false; -        unsigned FirstZeroAfterNonZero = NumConsecutiveStores; -        for (unsigned i = 0; i < NumConsecutiveStores; ++i) { -          StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); -          SDValue StoredVal = ST->getValue(); -          bool IsElementZero = false; -          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) -            IsElementZero = C->isNullValue(); -          else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) -            IsElementZero = C->getConstantFPValue()->isNullValue(); -          if (IsElementZero) { -            if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) -              FirstZeroAfterNonZero = i; -          } -          NonZero |= !IsElementZero; - -          // Find a legal type for the constant store. 
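The new getConsecutiveStores helper above looks for a run of stores whose offsets from the base pointer increase by exactly the element size; a gap or overlap ends the run and the leading store is pruned. A small standalone sketch of that scan over plain offsets (not the MemOpLink machinery):

#include <cassert>
#include <cstdint>
#include <vector>

// Count how many stores, starting from the first, sit exactly
// ElementSizeBytes apart. Offsets are assumed already sorted by distance
// from the base pointer, as in the real candidate list.
static unsigned countConsecutive(const std::vector<int64_t> &Offsets,
                                 int64_t ElementSizeBytes) {
  if (Offsets.empty())
    return 0;
  unsigned Run = 1;
  for (unsigned i = 1; i < Offsets.size(); ++i) {
    if (Offsets[i] != Offsets[0] + (int64_t)i * ElementSizeBytes)
      break;
    Run = i + 1;
  }
  return Run;
}

int main() {
  // i32 stores at p+0, p+4, p+8, then a gap: only the first three can merge.
  assert(countConsecutive({0, 4, 8, 16}, 4) == 3);
  // A hole right after the first store stops the run immediately.
  assert(countConsecutive({0, 8, 12}, 4) == 1);
  return 0;
}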
-          unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; -          EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); -          bool IsFast = false; +    // There are no consecutive stores at the start of the list. +    // Remove the first store and try again. +    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); +  } +} -          // Break early when size is too large to be legal. -          if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) -            break; +bool DAGCombiner::tryStoreMergeOfConstants( +    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores, +    EVT MemVT, SDNode *RootNode, bool AllowVectors) { +  LLVMContext &Context = *DAG.getContext(); +  const DataLayout &DL = DAG.getDataLayout(); +  int64_t ElementSizeBytes = MemVT.getStoreSize(); +  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; +  bool MadeChange = false; + +  // Store the constants into memory as one consecutive store. +  while (NumConsecutiveStores >= 2) { +    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +    unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +    unsigned FirstStoreAlign = FirstInChain->getAlignment(); +    unsigned LastLegalType = 1; +    unsigned LastLegalVectorType = 1; +    bool LastIntegerTrunc = false; +    bool NonZero = false; +    unsigned FirstZeroAfterNonZero = NumConsecutiveStores; +    for (unsigned i = 0; i < NumConsecutiveStores; ++i) { +      StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode); +      SDValue StoredVal = ST->getValue(); +      bool IsElementZero = false; +      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) +        IsElementZero = C->isNullValue(); +      else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) +        IsElementZero = C->getConstantFPValue()->isNullValue(); +      if (IsElementZero) { +        if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) +          FirstZeroAfterNonZero = i; +      } +      NonZero |= !IsElementZero; -          if (TLI.isTypeLegal(StoreTy) && -              TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && -              TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                     *FirstInChain->getMemOperand(), &IsFast) && -              IsFast) { -            LastIntegerTrunc = false; -            LastLegalType = i + 1; -            // Or check whether a truncstore is legal. -          } else if (TLI.getTypeAction(Context, StoreTy) == -                     TargetLowering::TypePromoteInteger) { -            EVT LegalizedStoredValTy = -                TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); -            if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && -                TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && -                TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                       *FirstInChain->getMemOperand(), -                                       &IsFast) && -                IsFast) { -              LastIntegerTrunc = true; -              LastLegalType = i + 1; -            } -          } +      // Find a legal type for the constant store. +      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; +      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); +      bool IsFast = false; -          // We only use vectors if the constant is known to be zero or the -          // target allows it and the function is not marked with the -          // noimplicitfloat attribute. 
-          if ((!NonZero || -               TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && -              !NoVectors) { -            // Find a legal type for the vector store. -            unsigned Elts = (i + 1) * NumMemElts; -            EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); -            if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && -                TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && -                TLI.allowsMemoryAccess( -                    Context, DL, Ty, *FirstInChain->getMemOperand(), &IsFast) && -                IsFast) -              LastLegalVectorType = i + 1; -          } -        } +      // Break early when size is too large to be legal. +      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) +        break; -        bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; -        unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; - -        // Check if we found a legal integer type that creates a meaningful -        // merge. -        if (NumElem < 2) { -          // We know that candidate stores are in order and of correct -          // shape. While there is no mergeable sequence from the -          // beginning one may start later in the sequence. The only -          // reason a merge of size N could have failed where another of -          // the same size would not have, is if the alignment has -          // improved or we've dropped a non-zero value. Drop as many -          // candidates as we can here. -          unsigned NumSkip = 1; -          while ( -              (NumSkip < NumConsecutiveStores) && -              (NumSkip < FirstZeroAfterNonZero) && -              (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) -            NumSkip++; - -          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); -          NumConsecutiveStores -= NumSkip; -          continue; +      if (TLI.isTypeLegal(StoreTy) && +          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && +          TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                 *FirstInChain->getMemOperand(), &IsFast) && +          IsFast) { +        LastIntegerTrunc = false; +        LastLegalType = i + 1; +        // Or check whether a truncstore is legal. +      } else if (TLI.getTypeAction(Context, StoreTy) == +                 TargetLowering::TypePromoteInteger) { +        EVT LegalizedStoredValTy = +            TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); +        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && +            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && +            TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                   *FirstInChain->getMemOperand(), &IsFast) && +            IsFast) { +          LastIntegerTrunc = true; +          LastLegalType = i + 1;          } +      } -        // Check that we can merge these candidates without causing a cycle. -        if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, -                                                      RootNode)) { -          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -          NumConsecutiveStores -= NumElem; -          continue; -        } +      // We only use vectors if the constant is known to be zero or the +      // target allows it and the function is not marked with the +      // noimplicitfloat attribute. 
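The scan in tryStoreMergeOfConstants picks the widest legal integer (or vector) type that can cover a run of constant stores; the merged store then writes one combined constant. A standalone sketch of the byte-level equivalence, building the combined constant for the host's byte order (the real combine consults the target's data layout):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Four adjacent i8 stores of known constants can become one i32 store
  // whose value lays the same bytes down in memory order.
  const unsigned char Bytes[4] = {0x11, 0x22, 0x33, 0x44};

  unsigned char Separate[4];
  for (unsigned i = 0; i < 4; ++i)
    Separate[i] = Bytes[i];           // four narrow constant stores

  uint32_t Combined;
  std::memcpy(&Combined, Bytes, 4);   // the merged constant, host byte order

  unsigned char Merged[4];
  std::memcpy(Merged, &Combined, 4);  // one wide constant store

  assert(std::memcmp(Separate, Merged, 4) == 0);
  return 0;
}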
+      if ((!NonZero || +           TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && +          AllowVectors) { +        // Find a legal type for the vector store. +        unsigned Elts = (i + 1) * NumMemElts; +        EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); +        if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && +            TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && +            TLI.allowsMemoryAccess(Context, DL, Ty, +                                   *FirstInChain->getMemOperand(), &IsFast) && +            IsFast) +          LastLegalVectorType = i + 1; +      } +    } -        RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true, -                                              UseVector, LastIntegerTrunc); +    bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors; +    unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; + +    // Check if we found a legal integer type that creates a meaningful +    // merge. +    if (NumElem < 2) { +      // We know that candidate stores are in order and of correct +      // shape. While there is no mergeable sequence from the +      // beginning one may start later in the sequence. The only +      // reason a merge of size N could have failed where another of +      // the same size would not have, is if the alignment has +      // improved or we've dropped a non-zero value. Drop as many +      // candidates as we can here. +      unsigned NumSkip = 1; +      while ((NumSkip < NumConsecutiveStores) && +             (NumSkip < FirstZeroAfterNonZero) && +             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) +        NumSkip++; + +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); +      NumConsecutiveStores -= NumSkip; +      continue; +    } -        // Remove merged stores for next iteration. -        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -        NumConsecutiveStores -= NumElem; -      } +    // Check that we can merge these candidates without causing a cycle. +    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, +                                                  RootNode)) { +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); +      NumConsecutiveStores -= NumElem;        continue;      } -    // When extracting multiple vector elements, try to store them -    // in one vector store rather than a sequence of scalar stores. -    if (IsExtractVecSrc) { -      // Loop on Consecutive Stores on success. -      while (NumConsecutiveStores >= 2) { -        LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; -        unsigned FirstStoreAS = FirstInChain->getAddressSpace(); -        unsigned FirstStoreAlign = FirstInChain->getAlignment(); -        unsigned NumStoresToMerge = 1; -        for (unsigned i = 0; i < NumConsecutiveStores; ++i) { -          // Find a legal type for the vector store. -          unsigned Elts = (i + 1) * NumMemElts; -          EVT Ty = -              EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); -          bool IsFast; - -          // Break early when size is too large to be legal. 
-          if (Ty.getSizeInBits() > MaximumLegalStoreInBits) -            break; +    MadeChange |= mergeStoresOfConstantsOrVecElts( +        StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); -          if (TLI.isTypeLegal(Ty) && -              TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && -              TLI.allowsMemoryAccess(Context, DL, Ty, -                                     *FirstInChain->getMemOperand(), &IsFast) && -              IsFast) -            NumStoresToMerge = i + 1; -        } +    // Remove merged stores for next iteration. +    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); +    NumConsecutiveStores -= NumElem; +  } +  return MadeChange; +} -        // Check if we found a legal integer type creating a meaningful -        // merge. -        if (NumStoresToMerge < 2) { -          // We know that candidate stores are in order and of correct -          // shape. While there is no mergeable sequence from the -          // beginning one may start later in the sequence. The only -          // reason a merge of size N could have failed where another of -          // the same size would not have, is if the alignment has -          // improved. Drop as many candidates as we can here. -          unsigned NumSkip = 1; -          while ( -              (NumSkip < NumConsecutiveStores) && -              (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) -            NumSkip++; - -          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); -          NumConsecutiveStores -= NumSkip; -          continue; -        } +bool DAGCombiner::tryStoreMergeOfExtracts( +    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores, +    EVT MemVT, SDNode *RootNode) { +  LLVMContext &Context = *DAG.getContext(); +  const DataLayout &DL = DAG.getDataLayout(); +  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; +  bool MadeChange = false; + +  // Loop on Consecutive Stores on success. +  while (NumConsecutiveStores >= 2) { +    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +    unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +    unsigned FirstStoreAlign = FirstInChain->getAlignment(); +    unsigned NumStoresToMerge = 1; +    for (unsigned i = 0; i < NumConsecutiveStores; ++i) { +      // Find a legal type for the vector store. +      unsigned Elts = (i + 1) * NumMemElts; +      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); +      bool IsFast = false; -        // Check that we can merge these candidates without causing a cycle. -        if (!checkMergeStoreCandidatesForDependencies( -                StoreNodes, NumStoresToMerge, RootNode)) { -          StoreNodes.erase(StoreNodes.begin(), -                           StoreNodes.begin() + NumStoresToMerge); -          NumConsecutiveStores -= NumStoresToMerge; -          continue; -        } +      // Break early when size is too large to be legal. +      if (Ty.getSizeInBits() > MaximumLegalStoreInBits) +        break; -        RV |= MergeStoresOfConstantsOrVecElts( -            StoreNodes, MemVT, NumStoresToMerge, false, true, false); +      if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && +          TLI.allowsMemoryAccess(Context, DL, Ty, +                                 *FirstInChain->getMemOperand(), &IsFast) && +          IsFast) +        NumStoresToMerge = i + 1; +    } + +    // Check if we found a legal integer type creating a meaningful +    // merge. 
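tryStoreMergeOfExtracts turns a run of scalar stores of extracted vector lanes into one vector store once a legal, fast vector type covers them. The memory-level equivalence it relies on, in a standalone sketch (plain arrays standing in for vector lanes):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Vec[4] = {1, 2, 3, 4};

  uint32_t PerLane[4];
  for (unsigned i = 0; i < 4; ++i)
    PerLane[i] = Vec[i];              // four scalar stores of extracted lanes

  uint32_t Wide[4];
  std::memcpy(Wide, Vec, sizeof Vec); // one vector-width store

  assert(std::memcmp(PerLane, Wide, sizeof Vec) == 0);
  return 0;
}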
+    if (NumStoresToMerge < 2) { +      // We know that candidate stores are in order and of correct +      // shape. While there is no mergeable sequence from the +      // beginning one may start later in the sequence. The only +      // reason a merge of size N could have failed where another of +      // the same size would not have, is if the alignment has +      // improved. Drop as many candidates as we can here. +      unsigned NumSkip = 1; +      while ((NumSkip < NumConsecutiveStores) && +             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) +        NumSkip++; + +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); +      NumConsecutiveStores -= NumSkip; +      continue; +    } -        StoreNodes.erase(StoreNodes.begin(), -                         StoreNodes.begin() + NumStoresToMerge); -        NumConsecutiveStores -= NumStoresToMerge; -      } +    // Check that we can merge these candidates without causing a cycle. +    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge, +                                                  RootNode)) { +      StoreNodes.erase(StoreNodes.begin(), +                       StoreNodes.begin() + NumStoresToMerge); +      NumConsecutiveStores -= NumStoresToMerge;        continue;      } -    // Below we handle the case of multiple consecutive stores that -    // come from multiple consecutive loads. We merge them into a single -    // wide load and a single wide store. +    MadeChange |= mergeStoresOfConstantsOrVecElts( +        StoreNodes, MemVT, NumStoresToMerge, false, true, false); + +    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); +    NumConsecutiveStores -= NumStoresToMerge; +  } +  return MadeChange; +} + +bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, +                                       unsigned NumConsecutiveStores, EVT MemVT, +                                       SDNode *RootNode, bool AllowVectors, +                                       bool IsNonTemporalStore, +                                       bool IsNonTemporalLoad) { +  LLVMContext &Context = *DAG.getContext(); +  const DataLayout &DL = DAG.getDataLayout(); +  int64_t ElementSizeBytes = MemVT.getStoreSize(); +  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; +  bool MadeChange = false; -    // Look for load nodes which are used by the stored values. -    SmallVector<MemOpLink, 8> LoadNodes; +  int64_t StartAddress = StoreNodes[0].OffsetFromBase; -    // Find acceptable loads. Loads need to have the same chain (token factor), -    // must not be zext, volatile, indexed, and they must be consecutive. -    BaseIndexOffset LdBasePtr; +  // Look for load nodes which are used by the stored values. +  SmallVector<MemOpLink, 8> LoadNodes; -    for (unsigned i = 0; i < NumConsecutiveStores; ++i) { -      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); -      SDValue Val = peekThroughBitcasts(St->getValue()); -      LoadSDNode *Ld = cast<LoadSDNode>(Val); - -      BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG); -      // If this is not the first ptr that we check. -      int64_t LdOffset = 0; -      if (LdBasePtr.getBase().getNode()) { -        // The base ptr must be the same. -        if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) -          break; -      } else { -        // Check that all other base pointers are the same as this one. 
-        LdBasePtr = LdPtr; -      } +  // Find acceptable loads. Loads need to have the same chain (token factor), +  // must not be zext, volatile, indexed, and they must be consecutive. +  BaseIndexOffset LdBasePtr; + +  for (unsigned i = 0; i < NumConsecutiveStores; ++i) { +    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); +    SDValue Val = peekThroughBitcasts(St->getValue()); +    LoadSDNode *Ld = cast<LoadSDNode>(Val); -      // We found a potential memory operand to merge. -      LoadNodes.push_back(MemOpLink(Ld, LdOffset)); +    BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG); +    // If this is not the first ptr that we check. +    int64_t LdOffset = 0; +    if (LdBasePtr.getBase().getNode()) { +      // The base ptr must be the same. +      if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) +        break; +    } else { +      // Check that all other base pointers are the same as this one. +      LdBasePtr = LdPtr;      } -    while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { +    // We found a potential memory operand to merge. +    LoadNodes.push_back(MemOpLink(Ld, LdOffset)); +  } + +  while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { +    Align RequiredAlignment; +    bool NeedRotate = false; +    if (LoadNodes.size() == 2) {        // If we have load/store pair instructions and we only have two values,        // don't bother merging. -      unsigned RequiredAlignment; -      if (LoadNodes.size() == 2 && -          TLI.hasPairedLoad(MemVT, RequiredAlignment) && -          StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { +      if (TLI.hasPairedLoad(MemVT, RequiredAlignment) && +          StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {          StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);          LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);          break;        } -      LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; -      unsigned FirstStoreAS = FirstInChain->getAddressSpace(); -      unsigned FirstStoreAlign = FirstInChain->getAlignment(); -      LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); -      unsigned FirstLoadAlign = FirstLoad->getAlignment(); - -      // Scan the memory operations on the chain and find the first -      // non-consecutive load memory address. These variables hold the index in -      // the store node array. - -      unsigned LastConsecutiveLoad = 1; - -      // This variable refers to the size and not index in the array. -      unsigned LastLegalVectorType = 1; -      unsigned LastLegalIntegerType = 1; -      bool isDereferenceable = true; -      bool DoIntegerTruncate = false; -      StartAddress = LoadNodes[0].OffsetFromBase; -      SDValue FirstChain = FirstLoad->getChain(); -      for (unsigned i = 1; i < LoadNodes.size(); ++i) { -        // All loads must share the same chain. -        if (LoadNodes[i].MemNode->getChain() != FirstChain) -          break; +      // If the loads are reversed, see if we can rotate the halves into place. 
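The comment above introduces the new rotate fix-up: when the two loads feed the pair of stores in reversed order, a single wide load whose halves are swapped by a rotate of half the width stores the same bytes as the two original store pairs. A standalone demonstration of why the rotate is sufficient (holds for either endianness; illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t rotl64(uint64_t V, unsigned N) {
  return (V << N) | (V >> (64 - N));
}

int main() {
  // Two i32 stores write adjacent slots dst[0] and dst[1], but their values
  // come from the *reversed* halves of an adjacent pair of loads:
  //   dst[0] = src[1];  dst[1] = src[0];
  // One wide load plus a rotate by half the width produces the same wide
  // value, so one load + rotate + one store replaces four memory operations.
  uint32_t Src[2] = {0x01020304u, 0x05060708u};

  uint32_t Expected[2];
  Expected[0] = Src[1];
  Expected[1] = Src[0];

  uint64_t Wide;
  std::memcpy(&Wide, Src, 8);          // one 64-bit load of both halves
  Wide = rotl64(Wide, 32);             // swap the halves in the value

  uint32_t Actual[2];
  std::memcpy(Actual, &Wide, 8);       // one 64-bit store

  assert(std::memcmp(Actual, Expected, 8) == 0);
  return 0;
}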
+      int64_t Offset0 = LoadNodes[0].OffsetFromBase; +      int64_t Offset1 = LoadNodes[1].OffsetFromBase; +      EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2); +      if (Offset0 - Offset1 == ElementSizeBytes && +          (hasOperation(ISD::ROTL, PairVT) || +           hasOperation(ISD::ROTR, PairVT))) { +        std::swap(LoadNodes[0], LoadNodes[1]); +        NeedRotate = true; +      } +    } +    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; +    unsigned FirstStoreAS = FirstInChain->getAddressSpace(); +    unsigned FirstStoreAlign = FirstInChain->getAlignment(); +    LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); -        int64_t CurrAddress = LoadNodes[i].OffsetFromBase; -        if (CurrAddress - StartAddress != (ElementSizeBytes * i)) -          break; -        LastConsecutiveLoad = i; +    // Scan the memory operations on the chain and find the first +    // non-consecutive load memory address. These variables hold the index in +    // the store node array. + +    unsigned LastConsecutiveLoad = 1; + +    // This variable refers to the size and not index in the array. +    unsigned LastLegalVectorType = 1; +    unsigned LastLegalIntegerType = 1; +    bool isDereferenceable = true; +    bool DoIntegerTruncate = false; +    StartAddress = LoadNodes[0].OffsetFromBase; +    SDValue LoadChain = FirstLoad->getChain(); +    for (unsigned i = 1; i < LoadNodes.size(); ++i) { +      // All loads must share the same chain. +      if (LoadNodes[i].MemNode->getChain() != LoadChain) +        break; -        if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) -          isDereferenceable = false; +      int64_t CurrAddress = LoadNodes[i].OffsetFromBase; +      if (CurrAddress - StartAddress != (ElementSizeBytes * i)) +        break; +      LastConsecutiveLoad = i; -        // Find a legal type for the vector store. -        unsigned Elts = (i + 1) * NumMemElts; -        EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); +      if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) +        isDereferenceable = false; -        // Break early when size is too large to be legal. -        if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) -          break; +      // Find a legal type for the vector store. +      unsigned Elts = (i + 1) * NumMemElts; +      EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); -        bool IsFastSt, IsFastLd; -        if (TLI.isTypeLegal(StoreTy) && -            TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && -            TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                   *FirstInChain->getMemOperand(), &IsFastSt) && -            IsFastSt && -            TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                   *FirstLoad->getMemOperand(), &IsFastLd) && -            IsFastLd) { -          LastLegalVectorType = i + 1; -        } +      // Break early when size is too large to be legal. +      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) +        break; -        // Find a legal type for the integer store. 
-        unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; -        StoreTy = EVT::getIntegerVT(Context, SizeInBits); -        if (TLI.isTypeLegal(StoreTy) && -            TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && +      bool IsFastSt = false; +      bool IsFastLd = false; +      if (TLI.isTypeLegal(StoreTy) && +          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && +          TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                 *FirstInChain->getMemOperand(), &IsFastSt) && +          IsFastSt && +          TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                 *FirstLoad->getMemOperand(), &IsFastLd) && +          IsFastLd) { +        LastLegalVectorType = i + 1; +      } + +      // Find a legal type for the integer store. +      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; +      StoreTy = EVT::getIntegerVT(Context, SizeInBits); +      if (TLI.isTypeLegal(StoreTy) && +          TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && +          TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                 *FirstInChain->getMemOperand(), &IsFastSt) && +          IsFastSt && +          TLI.allowsMemoryAccess(Context, DL, StoreTy, +                                 *FirstLoad->getMemOperand(), &IsFastLd) && +          IsFastLd) { +        LastLegalIntegerType = i + 1; +        DoIntegerTruncate = false; +        // Or check whether a truncstore and extload is legal. +      } else if (TLI.getTypeAction(Context, StoreTy) == +                 TargetLowering::TypePromoteInteger) { +        EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); +        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && +            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && +            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) && +            TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) && +            TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&              TLI.allowsMemoryAccess(Context, DL, StoreTy,                                     *FirstInChain->getMemOperand(), &IsFastSt) &&              IsFastSt && @@ -16125,149 +16658,225 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {                                     *FirstLoad->getMemOperand(), &IsFastLd) &&              IsFastLd) {            LastLegalIntegerType = i + 1; -          DoIntegerTruncate = false; -          // Or check whether a truncstore and extload is legal. 
-        } else if (TLI.getTypeAction(Context, StoreTy) == -                   TargetLowering::TypePromoteInteger) { -          EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); -          if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && -              TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && -              TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, -                                 StoreTy) && -              TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, -                                 StoreTy) && -              TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) && -              TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                     *FirstInChain->getMemOperand(), -                                     &IsFastSt) && -              IsFastSt && -              TLI.allowsMemoryAccess(Context, DL, StoreTy, -                                     *FirstLoad->getMemOperand(), &IsFastLd) && -              IsFastLd) { -            LastLegalIntegerType = i + 1; -            DoIntegerTruncate = true; -          } +          DoIntegerTruncate = true;          }        } +    } -      // Only use vector types if the vector type is larger than the integer -      // type. If they are the same, use integers. -      bool UseVectorTy = -          LastLegalVectorType > LastLegalIntegerType && !NoVectors; -      unsigned LastLegalType = -          std::max(LastLegalVectorType, LastLegalIntegerType); - -      // We add +1 here because the LastXXX variables refer to location while -      // the NumElem refers to array/index size. -      unsigned NumElem = -          std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); -      NumElem = std::min(LastLegalType, NumElem); - -      if (NumElem < 2) { -        // We know that candidate stores are in order and of correct -        // shape. While there is no mergeable sequence from the -        // beginning one may start later in the sequence. The only -        // reason a merge of size N could have failed where another of -        // the same size would not have is if the alignment or either -        // the load or store has improved. Drop as many candidates as we -        // can here. -        unsigned NumSkip = 1; -        while ((NumSkip < LoadNodes.size()) && -               (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && -               (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) -          NumSkip++; -        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); -        LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); -        NumConsecutiveStores -= NumSkip; -        continue; -      } +    // Only use vector types if the vector type is larger than the integer +    // type. If they are the same, use integers. +    bool UseVectorTy = +        LastLegalVectorType > LastLegalIntegerType && AllowVectors; +    unsigned LastLegalType = +        std::max(LastLegalVectorType, LastLegalIntegerType); + +    // We add +1 here because the LastXXX variables refer to location while +    // the NumElem refers to array/index size. +    unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); +    NumElem = std::min(LastLegalType, NumElem); +    unsigned FirstLoadAlign = FirstLoad->getAlignment(); + +    if (NumElem < 2) { +      // We know that candidate stores are in order and of correct +      // shape. 
While there is no mergeable sequence from the +      // beginning one may start later in the sequence. The only +      // reason a merge of size N could have failed where another of +      // the same size would not have is if the alignment or either +      // the load or store has improved. Drop as many candidates as we +      // can here. +      unsigned NumSkip = 1; +      while ((NumSkip < LoadNodes.size()) && +             (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && +             (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) +        NumSkip++; +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); +      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); +      NumConsecutiveStores -= NumSkip; +      continue; +    } -      // Check that we can merge these candidates without causing a cycle. -      if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, -                                                    RootNode)) { -        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -        LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); -        NumConsecutiveStores -= NumElem; -        continue; -      } +    // Check that we can merge these candidates without causing a cycle. +    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, +                                                  RootNode)) { +      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); +      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); +      NumConsecutiveStores -= NumElem; +      continue; +    } -      // Find if it is better to use vectors or integers to load and store -      // to memory. -      EVT JointMemOpVT; -      if (UseVectorTy) { -        // Find a legal type for the vector store. -        unsigned Elts = NumElem * NumMemElts; -        JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); -      } else { -        unsigned SizeInBits = NumElem * ElementSizeBytes * 8; -        JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); +    // Find if it is better to use vectors or integers to load and store +    // to memory. +    EVT JointMemOpVT; +    if (UseVectorTy) { +      // Find a legal type for the vector store. +      unsigned Elts = NumElem * NumMemElts; +      JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); +    } else { +      unsigned SizeInBits = NumElem * ElementSizeBytes * 8; +      JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); +    } + +    SDLoc LoadDL(LoadNodes[0].MemNode); +    SDLoc StoreDL(StoreNodes[0].MemNode); + +    // The merged loads are required to have the same incoming chain, so +    // using the first's chain is acceptable. + +    SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); +    AddToWorklist(NewStoreChain.getNode()); + +    MachineMemOperand::Flags LdMMOFlags = +        isDereferenceable ? MachineMemOperand::MODereferenceable +                          : MachineMemOperand::MONone; +    if (IsNonTemporalLoad) +      LdMMOFlags |= MachineMemOperand::MONonTemporal; + +    MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore +                                              ? 
MachineMemOperand::MONonTemporal +                                              : MachineMemOperand::MONone; + +    SDValue NewLoad, NewStore; +    if (UseVectorTy || !DoIntegerTruncate) { +      NewLoad = DAG.getLoad( +          JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), +          FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags); +      SDValue StoreOp = NewLoad; +      if (NeedRotate) { +        unsigned LoadWidth = ElementSizeBytes * 8 * 2; +        assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) && +               "Unexpected type for rotate-able load pair"); +        SDValue RotAmt = +            DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL); +        // Target can convert to the identical ROTR if it does not have ROTL. +        StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);        } +      NewStore = DAG.getStore( +          NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(), +          FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); +    } else { // This must be the truncstore/extload case +      EVT ExtendedTy = +          TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); +      NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, +                               FirstLoad->getChain(), FirstLoad->getBasePtr(), +                               FirstLoad->getPointerInfo(), JointMemOpVT, +                               FirstLoadAlign, LdMMOFlags); +      NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, +                                   FirstInChain->getBasePtr(), +                                   FirstInChain->getPointerInfo(), JointMemOpVT, +                                   FirstInChain->getAlignment(), +                                   FirstInChain->getMemOperand()->getFlags()); +    } + +    // Transfer chain users from old loads to the new load. +    for (unsigned i = 0; i < NumElem; ++i) { +      LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); +      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), +                                    SDValue(NewLoad.getNode(), 1)); +    } + +    // Replace all stores with the new store. Recursively remove corresponding +    // values if they are no longer used. +    for (unsigned i = 0; i < NumElem; ++i) { +      SDValue Val = StoreNodes[i].MemNode->getOperand(1); +      CombineTo(StoreNodes[i].MemNode, NewStore); +      if (Val.getNode()->use_empty()) +        recursivelyDeleteUnusedNodes(Val.getNode()); +    } + +    MadeChange = true; +    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); +    LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); +    NumConsecutiveStores -= NumElem; +  } +  return MadeChange; +} + +bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { +  if (OptLevel == CodeGenOpt::None || !EnableStoreMerging) +    return false; -      SDLoc LoadDL(LoadNodes[0].MemNode); -      SDLoc StoreDL(StoreNodes[0].MemNode); - -      // The merged loads are required to have the same incoming chain, so -      // using the first's chain is acceptable. - -      SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); -      AddToWorklist(NewStoreChain.getNode()); - -      MachineMemOperand::Flags LdMMOFlags = -          isDereferenceable ? 
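The NeedRotate path in the hunk above relies on a simple identity: rotating a double-width integer by half its width swaps its two halves, so a pair of loaded elements that the stores consume in the opposite order can still be merged into one wide load, one rotate, and one wide store. A minimal standalone sketch of that identity, assuming a 2 x i16 -> i32 merge (plain C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Rotate a 32-bit value left; Amt must stay in (0, 32) to avoid UB.
    static uint32_t rotl32(uint32_t X, unsigned Amt) {
      return (X << Amt) | (X >> (32 - Amt));
    }

    int main() {
      // Two adjacent 16-bit elements seen by one joint 32-bit load
      // (Lo ends up in the low half of the wide value).
      uint16_t Lo = 0x1111, Hi = 0x2222;
      uint32_t JointLoad = (uint32_t)Hi << 16 | Lo;

      // If the merged stores need the elements in swapped order, the value
      // that must reach memory has Hi in the low half and Lo in the high half.
      uint32_t Swapped = (uint32_t)Lo << 16 | Hi;

      // Rotating the joint load by half the load width produces exactly that,
      // which is why a single ROTL (or the equivalent ROTR) suffices.
      assert(rotl32(JointLoad, 16) == Swapped);
      return 0;
    }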
MachineMemOperand::MODereferenceable -                            : MachineMemOperand::MONone; -      if (IsNonTemporalLoad) -        LdMMOFlags |= MachineMemOperand::MONonTemporal; - -      MachineMemOperand::Flags StMMOFlags = -          IsNonTemporalStore ? MachineMemOperand::MONonTemporal -                             : MachineMemOperand::MONone; - -      SDValue NewLoad, NewStore; -      if (UseVectorTy || !DoIntegerTruncate) { -        NewLoad = -            DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), -                        FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), -                        FirstLoadAlign, LdMMOFlags); -        NewStore = DAG.getStore( -            NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), -            FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); -      } else { // This must be the truncstore/extload case -        EVT ExtendedTy = -            TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); -        NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, -                                 FirstLoad->getChain(), FirstLoad->getBasePtr(), -                                 FirstLoad->getPointerInfo(), JointMemOpVT, -                                 FirstLoadAlign, LdMMOFlags); -        NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, -                                     FirstInChain->getBasePtr(), -                                     FirstInChain->getPointerInfo(), -                                     JointMemOpVT, FirstInChain->getAlignment(), -                                     FirstInChain->getMemOperand()->getFlags()); -      } +  // TODO: Extend this function to merge stores of scalable vectors. +  // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8> +  // store since we know <vscale x 16 x i8> is exactly twice as large as +  // <vscale x 8 x i8>). Until then, bail out for scalable vectors. +  EVT MemVT = St->getMemoryVT(); +  if (MemVT.isScalableVector()) +    return false; +  if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) +    return false; -      // Transfer chain users from old loads to the new load. -      for (unsigned i = 0; i < NumElem; ++i) { -        LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); -        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), -                                      SDValue(NewLoad.getNode(), 1)); -      } +  // This function cannot currently deal with non-byte-sized memory sizes. +  int64_t ElementSizeBytes = MemVT.getStoreSize(); +  if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits()) +    return false; -      // Replace the all stores with the new store. Recursively remove -      // corresponding value if its no longer used. -      for (unsigned i = 0; i < NumElem; ++i) { -        SDValue Val = StoreNodes[i].MemNode->getOperand(1); -        CombineTo(StoreNodes[i].MemNode, NewStore); -        if (Val.getNode()->use_empty()) -          recursivelyDeleteUnusedNodes(Val.getNode()); -      } +  // Do not bother looking at stored values that are not constants, loads, or +  // extracted vector elements. 
+  SDValue StoredVal = peekThroughBitcasts(St->getValue()); +  const StoreSource StoreSrc = getStoreSource(StoredVal); +  if (StoreSrc == StoreSource::Unknown) +    return false; -      RV = true; -      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); -      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); -      NumConsecutiveStores -= NumElem; +  SmallVector<MemOpLink, 8> StoreNodes; +  SDNode *RootNode; +  // Find potential store merge candidates by searching through chain sub-DAG +  getStoreMergeCandidates(St, StoreNodes, RootNode); + +  // Check if there is anything to merge. +  if (StoreNodes.size() < 2) +    return false; + +  // Sort the memory operands according to their distance from the +  // base pointer. +  llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) { +    return LHS.OffsetFromBase < RHS.OffsetFromBase; +  }); + +  bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute( +      Attribute::NoImplicitFloat); +  bool IsNonTemporalStore = St->isNonTemporal(); +  bool IsNonTemporalLoad = StoreSrc == StoreSource::Load && +                           cast<LoadSDNode>(StoredVal)->isNonTemporal(); + +  // Store Merge attempts to merge the lowest stores. This generally +  // works out as if successful, as the remaining stores are checked +  // after the first collection of stores is merged. However, in the +  // case that a non-mergeable store is found first, e.g., {p[-2], +  // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent +  // mergeable cases. To prevent this, we prune such stores from the +  // front of StoreNodes here. +  bool MadeChange = false; +  while (StoreNodes.size() > 1) { +    unsigned NumConsecutiveStores = +        getConsecutiveStores(StoreNodes, ElementSizeBytes); +    // There are no more stores in the list to examine. +    if (NumConsecutiveStores == 0) +      return MadeChange; + +    // We have at least 2 consecutive stores. Try to merge them. +    assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); +    switch (StoreSrc) { +    case StoreSource::Constant: +      MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, +                                             MemVT, RootNode, AllowVectors); +      break; + +    case StoreSource::Extract: +      MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores, +                                            MemVT, RootNode); +      break; + +    case StoreSource::Load: +      MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores, +                                         MemVT, RootNode, AllowVectors, +                                         IsNonTemporalStore, IsNonTemporalLoad); +      break; + +    default: +      llvm_unreachable("Unhandled store source type");      }    } -  return RV; +  return MadeChange;  }  SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { @@ -16408,11 +17017,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {    // Try to infer better alignment information than the store already has.    
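The rewritten mergeConsecutiveStores driver above sorts the candidates by their offset from the common base pointer and then repeatedly peels off the leading run of gap-free stores, pruning non-mergeable stores from the front as the p[-2], p[0], p[1], ... comment describes. A rough standalone sketch of that leading-run step; leadingConsecutiveStores is a hypothetical stand-in for getConsecutiveStores, not its actual implementation:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Given candidate store offsets (in bytes) from a common base, sort them
    // and return how many stores at the front form a gap-free run of
    // ElementSizeBytes-sized accesses. The caller merges that run (if it has
    // at least two members) and retries with whatever remains.
    static unsigned leadingConsecutiveStores(std::vector<int64_t> &Offsets,
                                             int64_t ElementSizeBytes) {
      std::sort(Offsets.begin(), Offsets.end());
      unsigned N = 1;
      while (N < Offsets.size() &&
             Offsets[N] == Offsets[0] + (int64_t)N * ElementSizeBytes)
        ++N;
      return N;
    }

    int main() {
      // Stores to p[-2], p[0], p[1], p[2], p[3] with 4-byte elements: the
      // first candidate is not adjacent to the rest, so the run stops at 1
      // and that store gets dropped from the front.
      std::vector<int64_t> Offsets = {-8, 0, 4, 8, 12};
      assert(leadingConsecutiveStores(Offsets, 4) == 1);

      // After pruning the front store, the remaining four are consecutive.
      Offsets.erase(Offsets.begin());
      assert(leadingConsecutiveStores(Offsets, 4) == 4);
      return 0;
    }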
if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) { -    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { -      if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) { +    if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) { +      if (*Alignment > ST->getAlign() && +          isAligned(*Alignment, ST->getSrcValueOffset())) {          SDValue NewStore =              DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), -                              ST->getMemoryVT(), Align, +                              ST->getMemoryVT(), *Alignment,                                ST->getMemOperand()->getFlags(), ST->getAAInfo());          // NewStore will always be N as we are only refining the alignment          assert(NewStore.getNode() == N); @@ -16497,7 +17107,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {        }        if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && -          !ST1->getBasePtr().isUndef()) { +          !ST1->getBasePtr().isUndef() && +          // BaseIndexOffset and the code below requires knowing the size +          // of a vector, so bail out if MemoryVT is scalable. +          !ST1->getMemoryVT().isScalableVector()) {          const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);          const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);          unsigned STBitSize = ST->getMemoryVT().getSizeInBits(); @@ -16510,33 +17123,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {            CombineTo(ST1, ST1->getChain());            return SDValue();          } - -        // If ST stores to a subset of preceding store's write set, we may be -        // able to fold ST's value into the preceding stored value. As we know -        // the other uses of ST1's chain are unconcerned with ST, this folding -        // will not affect those nodes. -        int64_t BitOffset; -        if (ChainBase.contains(DAG, ChainBitSize, STBase, STBitSize, -                               BitOffset)) { -          SDValue ChainValue = ST1->getValue(); -          if (auto *C1 = dyn_cast<ConstantSDNode>(ChainValue)) { -            if (auto *C = dyn_cast<ConstantSDNode>(Value)) { -              APInt Val = C1->getAPIntValue(); -              APInt InsertVal = C->getAPIntValue().zextOrTrunc(STBitSize); -              // FIXME: Handle Big-endian mode. -              if (!DAG.getDataLayout().isBigEndian()) { -                Val.insertBits(InsertVal, BitOffset); -                SDValue NewSDVal = -                    DAG.getConstant(Val, SDLoc(C), ChainValue.getValueType(), -                                    C1->isTargetOpcode(), C1->isOpaque()); -                SDNode *NewST1 = DAG.UpdateNodeOperands( -                    ST1, ST1->getChain(), NewSDVal, ST1->getOperand(2), -                    ST1->getOperand(3)); -                return CombineTo(ST, SDValue(NewST1, 0)); -              } -            } -          } -        } // End ST subset of ST1 case.        }      }    } @@ -16559,7 +17145,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {        // There can be multiple store sequences on the same chain.        // Keep trying to merge store sequences until we are unable to do so        // or until we merge the last store on the chain. -      bool Changed = MergeConsecutiveStores(ST); +      bool Changed = mergeConsecutiveStores(ST);        if (!Changed) break;        // Return N as merge only uses CombineTo and no worklist clean        // up is necessary. 
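The alignment-refinement rewrite earlier in this hunk only fires when the inferred alignment is strictly larger than the store's current one and the source-value offset remains a multiple of it. The same guard written as plain arithmetic (a sketch of the condition only, not the LLVM Align/isAligned API):

    #include <cassert>
    #include <cstdint>

    // Accept a new alignment only if it is an improvement and the access
    // offset stays aligned to it, mirroring the two checks above.
    static bool shouldRefineAlignment(uint64_t OldAlign, uint64_t NewAlign,
                                      int64_t Offset) {
      return NewAlign > OldAlign && Offset % (int64_t)NewAlign == 0;
    }

    int main() {
      assert(shouldRefineAlignment(4, 16, 32));  // offset 32 is 16-byte aligned
      assert(!shouldRefineAlignment(4, 16, 8));  // offset 8 breaks 16-byte alignment
      assert(!shouldRefineAlignment(16, 16, 0)); // no improvement, keep the store
      return 0;
    }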
@@ -16835,6 +17421,10 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {    EVT SubVecVT = SubVec.getValueType();    EVT VT = DestVec.getValueType();    unsigned NumSrcElts = SubVecVT.getVectorNumElements(); +  // If the source only has a single vector element, the cost of creating adding +  // it to a vector is likely to exceed the cost of a insert_vector_elt. +  if (NumSrcElts == 1) +    return SDValue();    unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();    unsigned NumMaskVals = ExtendRatio * NumSrcElts; @@ -16880,12 +17470,12 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {    SDLoc DL(N);    EVT VT = InVec.getValueType(); -  unsigned NumElts = VT.getVectorNumElements(); +  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);    // Insert into out-of-bounds element is undefined. -  if (auto *IndexC = dyn_cast<ConstantSDNode>(EltNo)) -    if (IndexC->getZExtValue() >= VT.getVectorNumElements()) -      return DAG.getUNDEF(VT); +  if (IndexC && VT.isFixedLengthVector() && +      IndexC->getZExtValue() >= VT.getVectorNumElements()) +    return DAG.getUNDEF(VT);    // Remove redundant insertions:    // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x @@ -16893,17 +17483,25 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {        InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))      return InVec; -  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);    if (!IndexC) {      // If this is variable insert to undef vector, it might be better to splat:      // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >      if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) { -      SmallVector<SDValue, 8> Ops(NumElts, InVal); -      return DAG.getBuildVector(VT, DL, Ops); +      if (VT.isScalableVector()) +        return DAG.getSplatVector(VT, DL, InVal); +      else { +        SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal); +        return DAG.getBuildVector(VT, DL, Ops); +      }      }      return SDValue();    } +  if (VT.isScalableVector()) +    return SDValue(); + +  unsigned NumElts = VT.getVectorNumElements(); +    // We must know which element is being inserted for folds below here.    unsigned Elt = IndexC->getZExtValue();    if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) @@ -16968,11 +17566,12 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,    EVT ResultVT = EVE->getValueType(0);    EVT VecEltVT = InVecVT.getVectorElementType(); -  unsigned Align = OriginalLoad->getAlignment(); -  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( +  Align Alignment = OriginalLoad->getAlign(); +  Align NewAlign = DAG.getDataLayout().getABITypeAlign(        VecEltVT.getTypeForEVT(*DAG.getContext())); -  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) +  if (NewAlign > Alignment || +      !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))      return SDValue();    ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? 
@@ -16980,7 +17579,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,    if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))      return SDValue(); -  Align = NewAlign; +  Alignment = NewAlign;    SDValue NewPtr = OriginalLoad->getBasePtr();    SDValue Offset; @@ -17020,13 +17619,13 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,                                     : ISD::EXTLOAD;      Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,                            OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, -                          Align, OriginalLoad->getMemOperand()->getFlags(), +                          Alignment, OriginalLoad->getMemOperand()->getFlags(),                            OriginalLoad->getAAInfo());      Chain = Load.getValue(1);    } else { -    Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, -                       MPI, Align, OriginalLoad->getMemOperand()->getFlags(), -                       OriginalLoad->getAAInfo()); +    Load = DAG.getLoad( +        VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment, +        OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo());      Chain = Load.getValue(1);      if (ResultVT.bitsLT(VecEltVT))        Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); @@ -17102,6 +17701,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {    // (vextract (scalar_to_vector val, 0) -> val    if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) { +    // Only 0'th element of SCALAR_TO_VECTOR is defined. +    if (DAG.isKnownNeverZero(Index)) +      return DAG.getUNDEF(ScalarVT); +      // Check if the result type doesn't match the inserted element type. A      // SCALAR_TO_VECTOR may truncate the inserted element and the      // EXTRACT_VECTOR_ELT may widen the extracted vector. @@ -17115,15 +17718,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {    // extract_vector_elt of out-of-bounds element -> UNDEF    auto *IndexC = dyn_cast<ConstantSDNode>(Index); -  unsigned NumElts = VecVT.getVectorNumElements(); -  if (IndexC && IndexC->getAPIntValue().uge(NumElts)) +  if (IndexC && VecVT.isFixedLengthVector() && +      IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))      return DAG.getUNDEF(ScalarVT);    // extract_vector_elt (build_vector x, y), 1 -> y -  if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR && +  if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) || +       VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&        TLI.isTypeLegal(VecVT) &&        (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) { -    SDValue Elt = VecOp.getOperand(IndexC->getZExtValue()); +    assert((VecOp.getOpcode() != ISD::BUILD_VECTOR || +            VecVT.isFixedLengthVector()) && +           "BUILD_VECTOR used for scalable vectors"); +    unsigned IndexVal = +        VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0; +    SDValue Elt = VecOp.getOperand(IndexVal);      EVT InEltVT = Elt.getValueType();      // Sometimes build_vector's scalar input types do not match result type. @@ -17134,6 +17743,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {      // converts.    } +  if (VecVT.isScalableVector()) +    return SDValue(); + +  // All the code from this point onwards assumes fixed width vectors, but it's +  // possible that some of the combinations could be made to work for scalable +  // vectors too. 
+  unsigned NumElts = VecVT.getVectorNumElements(); +  unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); +    // TODO: These transforms should not require the 'hasOneUse' restriction, but    // there are regressions on multiple targets without it. We can end up with a    // mess of scalar and vector code if we reduce only part of the DAG to scalar. @@ -17157,7 +17775,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {               "Extract element and scalar to vector can't change element type "               "from FP to integer.");        unsigned XBitWidth = X.getValueSizeInBits(); -      unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();        BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;        // An extract element return value type can be wider than its vector @@ -17215,9 +17832,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {          // FIXME: Should really be just isOperationLegalOrCustom.          TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||          TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) { -      EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec, -                         DAG.getConstant(OrigElt, DL, IndexTy)); +                         DAG.getVectorIdxConstant(OrigElt, DL));      }    } @@ -17241,6 +17857,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {          AddToWorklist(N);        return SDValue(N, 0);      } +    APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth); +    if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) { +      // We simplified the vector operand of this extract element. If this +      // extract is not dead, visit it again so it is folded properly. +      if (N->getOpcode() != ISD::DELETED_NODE) +        AddToWorklist(N); +      return SDValue(N, 0); +    }    }    // Everything under here is trying to match an extract of a loaded value. @@ -17326,6 +17950,30 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {        Elt = (Idx < (int)NumElts) ? 
Idx : Idx - (int)NumElts;        Index = DAG.getConstant(Elt, DL, Index.getValueType());      } +  } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged && +             VecVT.getVectorElementType() == ScalarVT && +             (!LegalTypes || +              TLI.isTypeLegal( +                  VecOp.getOperand(0).getValueType().getVectorElementType()))) { +    // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0 +    //      -> extract_vector_elt a, 0 +    // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1 +    //      -> extract_vector_elt a, 1 +    // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2 +    //      -> extract_vector_elt b, 0 +    // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3 +    //      -> extract_vector_elt b, 1 +    SDLoc SL(N); +    EVT ConcatVT = VecOp.getOperand(0).getValueType(); +    unsigned ConcatNumElts = ConcatVT.getVectorNumElements(); +    SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL, +                                     Index.getValueType()); + +    SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts); +    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, +                              ConcatVT.getVectorElementType(), +                              ConcatOp, NewIdx); +    return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);    }    // Make sure we found a non-volatile load and the extractelement is @@ -17407,6 +18055,11 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {    if (!ValidTypes)      return SDValue(); +  // If we already have a splat buildvector, then don't fold it if it means +  // introducing zeros. +  if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true)) +    return SDValue(); +    bool isLE = DAG.getDataLayout().isLittleEndian();    unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();    assert(ElemRatio > 1 && "Invalid element size ratio"); @@ -17453,12 +18106,89 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {    return DAG.getBitcast(VT, BV);  } +// Simplify (build_vec (trunc $1) +//                     (trunc (srl $1 half-width)) +//                     (trunc (srl $1 (2 * half-width))) …) +// to (bitcast $1) +SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { +  assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector"); + +  // Only for little endian +  if (!DAG.getDataLayout().isLittleEndian()) +    return SDValue(); + +  SDLoc DL(N); +  EVT VT = N->getValueType(0); +  EVT OutScalarTy = VT.getScalarType(); +  uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits(); + +  // Only for power of two types to be sure that bitcast works well +  if (!isPowerOf2_64(ScalarTypeBitsize)) +    return SDValue(); + +  unsigned NumInScalars = N->getNumOperands(); + +  // Look through bitcasts +  auto PeekThroughBitcast = [](SDValue Op) { +    if (Op.getOpcode() == ISD::BITCAST) +      return Op.getOperand(0); +    return Op; +  }; + +  // The source value where all the parts are extracted. +  SDValue Src; +  for (unsigned i = 0; i != NumInScalars; ++i) { +    SDValue In = PeekThroughBitcast(N->getOperand(i)); +    // Ignore undef inputs. +    if (In.isUndef()) continue; + +    if (In.getOpcode() != ISD::TRUNCATE) +      return SDValue(); + +    In = PeekThroughBitcast(In.getOperand(0)); + +    if (In.getOpcode() != ISD::SRL) { +      // For now only build_vec without shuffling, handle shifts here in the +      // future. 
+      if (i != 0) +        return SDValue(); + +      Src = In; +    } else { +      // In is SRL +      SDValue part = PeekThroughBitcast(In.getOperand(0)); + +      if (!Src) { +        Src = part; +      } else if (Src != part) { +        // Vector parts do not stem from the same variable +        return SDValue(); +      } + +      SDValue ShiftAmtVal = In.getOperand(1); +      if (!isa<ConstantSDNode>(ShiftAmtVal)) +        return SDValue(); + +      uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1); + +      // The extracted value is not extracted at the right position +      if (ShiftAmt != i * ScalarTypeBitsize) +        return SDValue(); +    } +  } + +  // Only cast if the size is the same +  if (Src.getValueType().getSizeInBits() != VT.getSizeInBits()) +    return SDValue(); + +  return DAG.getBitcast(VT, Src); +} +  SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,                                             ArrayRef<int> VectorMask,                                             SDValue VecIn1, SDValue VecIn2,                                             unsigned LeftIdx, bool DidSplitVec) { -  MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); -  SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy); +  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);    EVT VT = N->getValueType(0);    EVT InVT1 = VecIn1.getValueType(); @@ -17492,7 +18222,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,          // If we only have one input vector, and it's twice the size of the          // output, split it in two.          VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, -                             DAG.getConstant(NumElems, DL, IdxTy)); +                             DAG.getVectorIdxConstant(NumElems, DL));          VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);          // Since we now have shorter input vectors, adjust the offset of the          // second vector's start. 
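The new reduceBuildVecTruncToBitCast folds a build_vector whose lane i is trunc(srl(Src, i * lane-width)) into a plain bitcast of Src, which is only a no-op for little-endian layouts, hence the endianness and power-of-two guards above. A standalone illustration of the equivalence it relies on, assuming a little-endian host and an i64 split into 4 x i16:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint64_t Src = 0x4444333322221111ULL;

      // The build_vector form: lane i is trunc(srl(Src, i * 16)).
      uint16_t BuildVec[4];
      for (unsigned i = 0; i != 4; ++i)
        BuildVec[i] = (uint16_t)(Src >> (i * 16));

      // The bitcast form: reinterpret the same 8 bytes as 4 x i16.
      uint16_t Bitcast[4];
      static_assert(sizeof(Bitcast) == sizeof(Src), "total width must match");
      std::memcpy(Bitcast, &Src, sizeof(Src));

      // On a little-endian host the two agree byte for byte; on big-endian
      // they would not, which is exactly why the combine bails out there.
      assert(std::memcmp(BuildVec, Bitcast, sizeof(BuildVec)) == 0);
      return 0;
    }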
@@ -17699,6 +18429,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {        return SDValue();      SDValue ExtractedFromVec = Op.getOperand(0); +    if (ExtractedFromVec.getValueType().isScalableVector()) +      return SDValue(); +      const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);      if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))        return SDValue(); @@ -17733,7 +18466,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {      unsigned NearestPow2 = 0;      SDValue Vec = VecIn.back();      EVT InVT = Vec.getValueType(); -    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());      SmallVector<unsigned, 8> IndexVec(NumElems, 0);      for (unsigned i = 0; i < NumElems; i++) { @@ -17752,9 +18484,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {                                       InVT.getVectorElementType(), SplitSize);        if (TLI.isTypeLegal(SplitVT)) {          SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, -                                     DAG.getConstant(SplitSize, DL, IdxTy)); +                                     DAG.getVectorIdxConstant(SplitSize, DL));          SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, -                                     DAG.getConstant(0, DL, IdxTy)); +                                     DAG.getVectorIdxConstant(0, DL));          VecIn.pop_back();          VecIn.push_back(VecIn1);          VecIn.push_back(VecIn2); @@ -17986,6 +18718,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {    if (SDValue V = reduceBuildVecExtToExtBuildVec(N))      return V; +  if (SDValue V = reduceBuildVecTruncToBitCast(N)) +    return V; +    if (SDValue V = reduceBuildVecToShuffle(N))      return V; @@ -18080,6 +18815,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {      // What vector are we extracting the subvector from and at what index?      SDValue ExtVec = Op.getOperand(0); +    int ExtIdx = Op.getConstantOperandVal(1);      // We want the EVT of the original extraction to correctly scale the      // extraction index. @@ -18092,10 +18828,6 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {        continue;      } -    if (!isa<ConstantSDNode>(Op.getOperand(1))) -      return SDValue(); -    int ExtIdx = Op.getConstantOperandVal(1); -      // Ensure that we are extracting a subvector from a vector the same      // size as the result.      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) @@ -18129,6 +18861,69 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {                                       DAG.getBitcast(VT, SV1), Mask, DAG);  } +static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { +  unsigned CastOpcode = N->getOperand(0).getOpcode(); +  switch (CastOpcode) { +  case ISD::SINT_TO_FP: +  case ISD::UINT_TO_FP: +  case ISD::FP_TO_SINT: +  case ISD::FP_TO_UINT: +    // TODO: Allow more opcodes? +    //  case ISD::BITCAST: +    //  case ISD::TRUNCATE: +    //  case ISD::ZERO_EXTEND: +    //  case ISD::SIGN_EXTEND: +    //  case ISD::FP_EXTEND: +    break; +  default: +    return SDValue(); +  } + +  EVT SrcVT = N->getOperand(0).getOperand(0).getValueType(); +  if (!SrcVT.isVector()) +    return SDValue(); + +  // All operands of the concat must be the same kind of cast from the same +  // source type. 
+  SmallVector<SDValue, 4> SrcOps; +  for (SDValue Op : N->ops()) { +    if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() || +        Op.getOperand(0).getValueType() != SrcVT) +      return SDValue(); +    SrcOps.push_back(Op.getOperand(0)); +  } + +  // The wider cast must be supported by the target. This is unusual because +  // the operation support type parameter depends on the opcode. In addition, +  // check the other type in the cast to make sure this is really legal. +  EVT VT = N->getValueType(0); +  EVT SrcEltVT = SrcVT.getVectorElementType(); +  unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands(); +  EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); +  const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +  switch (CastOpcode) { +  case ISD::SINT_TO_FP: +  case ISD::UINT_TO_FP: +    if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) || +        !TLI.isTypeLegal(VT)) +      return SDValue(); +    break; +  case ISD::FP_TO_SINT: +  case ISD::FP_TO_UINT: +    if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) || +        !TLI.isTypeLegal(ConcatSrcVT)) +      return SDValue(); +    break; +  default: +    llvm_unreachable("Unexpected cast opcode"); +  } + +  // concat (cast X), (cast Y)... -> cast (concat X, Y...) +  SDLoc DL(N); +  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps); +  return DAG.getNode(CastOpcode, DL, VT, NewConcat); +} +  SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {    // If we only have one input vector, we don't need to do any concatenation.    if (N->getNumOperands() == 1) @@ -18256,6 +19051,9 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {      if (SDValue V = combineConcatVectorOfExtracts(N, DAG))        return V; +  if (SDValue V = combineConcatVectorOfCasts(N, DAG)) +    return V; +    // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR    // nodes often generate nop CONCAT_VECTOR nodes.    // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that @@ -18287,14 +19085,9 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {          return SDValue();      } -    auto *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1)); -    // The extract index must be constant. -    if (!CS) -      return SDValue(); -      // Check that we are reading from the identity index.      unsigned IdentityIndex = i * PartNumElem; -    if (CS->getAPIntValue() != IdentityIndex) +    if (Op.getConstantOperandAPInt(1) != IdentityIndex)        return SDValue();    } @@ -18377,6 +19170,15 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {    if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1)      return SDValue(); +  // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be +  // reduced to the unary fneg when it is visited, and we probably want to deal +  // with fneg in a target-specific way. +  if (BOpcode == ISD::FSUB) { +    auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true); +    if (C && C->getValueAPF().isNegZero()) +      return SDValue(); +  } +    // The binop must be a vector type, so we can extract some fraction of it.    EVT WideBVT = BinOp.getValueType();    if (!WideBVT.isVector()) @@ -18412,12 +19214,11 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {    // bitcasted.    
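combineConcatVectorOfCasts uses the fact that a lane-wise conversion commutes with concatenation: converting each source vector and concatenating the results is the same as concatenating the sources and converting once, so only the legality of the wider cast (and of the concatenated types) needs checking. A small scalar-loop sketch of that identity for the sint_to_fp case; the element types are just an illustrative choice:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<int32_t> X = {-2, -1, 0, 1};
      std::vector<int32_t> Y = {7, 8, 9, 10};

      // concat (sint_to_fp X), (sint_to_fp Y)
      std::vector<float> CastThenConcat;
      for (int32_t V : X) CastThenConcat.push_back((float)V);
      for (int32_t V : Y) CastThenConcat.push_back((float)V);

      // sint_to_fp (concat X, Y)
      std::vector<int32_t> Concat = X;
      Concat.insert(Concat.end(), Y.begin(), Y.end());
      std::vector<float> ConcatThenCast;
      for (int32_t V : Concat) ConcatThenCast.push_back((float)V);

      // Lane-wise results are identical, so the DAG can legally be rewritten
      // from two narrow casts plus a concat into one wide cast.
      assert(CastThenConcat == ConcatThenCast);
      return 0;
    }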
unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();    unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); -  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();    if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&        BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {      // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)      SDLoc DL(Extract); -    SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT); +    SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);      SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,                              BinOp.getOperand(0), NewExtIndex);      SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, @@ -18457,7 +19258,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {      // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)      // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN      SDLoc DL(Extract); -    SDValue IndexC = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT); +    SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);      SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)                          : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,                                        BinOp.getOperand(0), IndexC); @@ -18489,6 +19290,26 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {    // Allow targets to opt-out.    EVT VT = Extract->getValueType(0); + +  // We can only create byte sized loads. +  if (!VT.isByteSized()) +    return SDValue(); + +  unsigned Index = ExtIdx->getZExtValue(); +  unsigned NumElts = VT.getVectorNumElements(); + +  // If the index is a multiple of the extract element count, we can offset the +  // address by the store size multiplied by the subvector index. Otherwise if +  // the scalar type is byte sized, we can just use the index multiplied by +  // the element size in bytes as the offset. +  unsigned Offset; +  if (Index % NumElts == 0) +    Offset = (Index / NumElts) * VT.getStoreSize(); +  else if (VT.getScalarType().isByteSized()) +    Offset = Index * VT.getScalarType().getStoreSize(); +  else +    return SDValue(); +    const TargetLowering &TLI = DAG.getTargetLoweringInfo();    if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))      return SDValue(); @@ -18496,8 +19317,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {    // The narrow load will be offset from the base address of the old load if    // we are extracting from something besides index 0 (little-endian).    SDLoc DL(Extract); -  SDValue BaseAddr = Ld->getOperand(1); -  unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); +  SDValue BaseAddr = Ld->getBasePtr();    // TODO: Use "BaseIndexOffset" to make this more effective.    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); @@ -18512,6 +19332,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {  SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {    EVT NVT = N->getValueType(0);    SDValue V = N->getOperand(0); +  uint64_t ExtIdx = N->getConstantOperandVal(1);    // Extract from UNDEF is UNDEF.    if (V.isUndef()) @@ -18523,9 +19344,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {    // Combine an extract of an extract into a single extract_subvector.    
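The offset logic added to narrowExtractedVectorLoad covers two cases: an extract index that is a whole number of result subvectors, where the address moves by the subvector store size, and otherwise a byte-sized element type, where it moves by the element store size. A worked-arithmetic sketch of those branches; narrowLoadOffset and the example widths are assumptions for illustration only:

    #include <cassert>
    #include <cstdint>

    // Byte offset of the narrowed load relative to the wide load's base.
    // Returns -1 when neither case applies (the combine gives up there).
    static int64_t narrowLoadOffset(uint64_t Index, uint64_t NumElts,
                                    uint64_t ScalarBits) {
      uint64_t SubVecBytes = NumElts * ScalarBits / 8; // extracted store size
      if (Index % NumElts == 0)
        return (Index / NumElts) * SubVecBytes;
      if (ScalarBits % 8 == 0) // element type is byte sized
        return Index * (ScalarBits / 8);
      return -1;
    }

    int main() {
      // Extracting the second v4i32 (index 4) from a wide v8i32 load lands
      // exactly one 16-byte subvector past the base address.
      assert(narrowLoadOffset(/*Index=*/4, /*NumElts=*/4, /*ScalarBits=*/32) == 16);

      // Extracting a v2i8 at element index 3 from a v16i8 load starts 3 bytes in.
      assert(narrowLoadOffset(/*Index=*/3, /*NumElts=*/2, /*ScalarBits=*/8) == 3);
      return 0;
    }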
// ext (ext X, C), 0 --> ext X, C -  SDValue Index = N->getOperand(1); -  if (isNullConstant(Index) && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && -      V.hasOneUse() && isa<ConstantSDNode>(V.getOperand(1))) { +  if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {      if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),                                      V.getConstantOperandVal(1)) &&          TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) { @@ -18536,21 +19355,20 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {    // Try to move vector bitcast after extract_subv by scaling extraction index:    // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') -  if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST && +  if (V.getOpcode() == ISD::BITCAST &&        V.getOperand(0).getValueType().isVector()) {      SDValue SrcOp = V.getOperand(0);      EVT SrcVT = SrcOp.getValueType(); -    unsigned SrcNumElts = SrcVT.getVectorNumElements(); -    unsigned DestNumElts = V.getValueType().getVectorNumElements(); +    unsigned SrcNumElts = SrcVT.getVectorMinNumElements(); +    unsigned DestNumElts = V.getValueType().getVectorMinNumElements();      if ((SrcNumElts % DestNumElts) == 0) {        unsigned SrcDestRatio = SrcNumElts / DestNumElts; -      unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio; +      ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), -                                      NewExtNumElts); +                                      NewExtEC);        if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { -        unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio;          SDLoc DL(N); -        SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); +        SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,                                           V.getOperand(0), NewIndex);          return DAG.getBitcast(NVT, NewExtract); @@ -18558,34 +19376,43 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {      }      if ((DestNumElts % SrcNumElts) == 0) {        unsigned DestSrcRatio = DestNumElts / SrcNumElts; -      if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { -        unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; -        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), -                                        SrcVT.getScalarType(), NewExtNumElts); -        if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && -            TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { -          unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; +      if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) { +        ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio; +        EVT ScalarVT = SrcVT.getScalarType(); +        if ((ExtIdx % DestSrcRatio) == 0) {            SDLoc DL(N); -          SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); -          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, -                                           V.getOperand(0), NewIndex); -          return DAG.getBitcast(NVT, NewExtract); +          unsigned IndexValScaled = ExtIdx / DestSrcRatio; +          EVT NewExtVT = +              
EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC); +          if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { +            SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); +            SDValue NewExtract = +                DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, +                            V.getOperand(0), NewIndex); +            return DAG.getBitcast(NVT, NewExtract); +          } +          if (NewExtEC == 1 && +              TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { +            SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); +            SDValue NewExtract = +                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, +                            V.getOperand(0), NewIndex); +            return DAG.getBitcast(NVT, NewExtract); +          }          }        }      }    } -  if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index)) { +  if (V.getOpcode() == ISD::CONCAT_VECTORS) { +    unsigned ExtNumElts = NVT.getVectorMinNumElements();      EVT ConcatSrcVT = V.getOperand(0).getValueType();      assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&             "Concat and extract subvector do not change element type"); - -    unsigned ExtIdx = N->getConstantOperandVal(1); -    unsigned ExtNumElts = NVT.getVectorNumElements(); -    assert(ExtIdx % ExtNumElts == 0 && +    assert((ExtIdx % ExtNumElts) == 0 &&             "Extract index is not a multiple of the input vector length."); -    unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorNumElements(); +    unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();      unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;      // If the concatenated source types match this extract, it's a direct @@ -18599,15 +19426,14 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {      // concat operand. Example:      //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->      //   v2i8 extract_subvec v8i8 Y, 6 -    if (ConcatSrcNumElts % ExtNumElts == 0) { +    if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {        SDLoc DL(N);        unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;        assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&               "Trying to extract from >1 concat operand?");        assert(NewExtIdx % ExtNumElts == 0 &&               "Extract index is not a multiple of the input vector length."); -      MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); -      SDValue NewIndexC = DAG.getConstant(NewExtIdx, DL, IdxTy); +      SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,                           V.getOperand(ConcatOpIdx), NewIndexC);      } @@ -18617,37 +19443,33 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {    // If the input is a build vector. Try to make a smaller build vector.    if (V.getOpcode() == ISD::BUILD_VECTOR) { -    if (auto *IdxC = dyn_cast<ConstantSDNode>(Index)) { -      EVT InVT = V.getValueType(); -      unsigned ExtractSize = NVT.getSizeInBits(); -      unsigned EltSize = InVT.getScalarSizeInBits(); -      // Only do this if we won't split any elements. -      if (ExtractSize % EltSize == 0) { -        unsigned NumElems = ExtractSize / EltSize; -        EVT EltVT = InVT.getVectorElementType(); -        EVT ExtractVT = NumElems == 1 ? 
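Both bitcast branches above rescale the extract_subvector index so it addresses the same bytes in the pre-bitcast type: multiply by SrcDestRatio when the source has more (narrower) elements, divide by DestSrcRatio when it has fewer (wider) ones, and give up when the division is not exact. A compact sketch of that index arithmetic; scaleExtractIndex is a hypothetical helper, not DAG code:

    #include <cassert>

    // Rescale an extract_subvector index across a bitcast between vectors of
    // SrcNumElts and DestNumElts elements (same total width). Returns -1 when
    // the index cannot be represented exactly after scaling.
    static int scaleExtractIndex(unsigned ExtIdx, unsigned SrcNumElts,
                                 unsigned DestNumElts) {
      if (SrcNumElts % DestNumElts == 0)   // e.g. v8i16 viewed as v4i32
        return ExtIdx * (SrcNumElts / DestNumElts);
      if (DestNumElts % SrcNumElts == 0) { // e.g. v2i64 viewed as v4i32
        unsigned Ratio = DestNumElts / SrcNumElts;
        if (ExtIdx % Ratio == 0)
          return ExtIdx / Ratio;
      }
      return -1;
    }

    int main() {
      // extract v2i32, (v4i32 bitcast (v8i16 X)), 2
      //   --> bitcast (extract v4i16, X, 4)
      assert(scaleExtractIndex(2, /*SrcNumElts=*/8, /*DestNumElts=*/4) == 4);
      // extract v2i32, (v4i32 bitcast (v2i64 X)), 2
      //   --> bitcast (extract v1i64, X, 1)
      assert(scaleExtractIndex(2, /*SrcNumElts=*/2, /*DestNumElts=*/4) == 1);
      // Index 1 does not land on an i64 boundary, so there is no rewrite.
      assert(scaleExtractIndex(1, /*SrcNumElts=*/2, /*DestNumElts=*/4) == -1);
      return 0;
    }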
EltVT -                                      : EVT::getVectorVT(*DAG.getContext(), -                                                         EltVT, NumElems); -        if ((Level < AfterLegalizeDAG || -             (NumElems == 1 || -              TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) && -            (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { -          unsigned IdxVal = IdxC->getZExtValue(); -          IdxVal *= NVT.getScalarSizeInBits(); -          IdxVal /= EltSize; - -          if (NumElems == 1) { -            SDValue Src = V->getOperand(IdxVal); -            if (EltVT != Src.getValueType()) -              Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); -            return DAG.getBitcast(NVT, Src); -          } - -          // Extract the pieces from the original build_vector. -          SDValue BuildVec = DAG.getBuildVector( -              ExtractVT, SDLoc(N), V->ops().slice(IdxVal, NumElems)); -          return DAG.getBitcast(NVT, BuildVec); +    EVT InVT = V.getValueType(); +    unsigned ExtractSize = NVT.getSizeInBits(); +    unsigned EltSize = InVT.getScalarSizeInBits(); +    // Only do this if we won't split any elements. +    if (ExtractSize % EltSize == 0) { +      unsigned NumElems = ExtractSize / EltSize; +      EVT EltVT = InVT.getVectorElementType(); +      EVT ExtractVT = +          NumElems == 1 ? EltVT +                        : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems); +      if ((Level < AfterLegalizeDAG || +           (NumElems == 1 || +            TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) && +          (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { +        unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize; + +        if (NumElems == 1) { +          SDValue Src = V->getOperand(IdxVal); +          if (EltVT != Src.getValueType()) +            Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); +          return DAG.getBitcast(NVT, Src);          } + +        // Extract the pieces from the original build_vector. +        SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), +                                              V->ops().slice(IdxVal, NumElems)); +        return DAG.getBitcast(NVT, BuildVec);        }      }    } @@ -18659,23 +19481,19 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {      if (!NVT.bitsEq(SmallVT))        return SDValue(); -    // Only handle cases where both indexes are constants. 
-    auto *ExtIdx = dyn_cast<ConstantSDNode>(Index); -    auto *InsIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); -    if (InsIdx && ExtIdx) { -      // Combine: -      //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) -      // Into: -      //    indices are equal or bit offsets are equal => V1 -      //    otherwise => (extract_subvec V1, ExtIdx) -      if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() == -          ExtIdx->getZExtValue() * NVT.getScalarSizeInBits()) -        return DAG.getBitcast(NVT, V.getOperand(1)); -      return DAG.getNode( -          ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, -          DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), -          Index); -    } +    // Combine: +    //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) +    // Into: +    //    indices are equal or bit offsets are equal => V1 +    //    otherwise => (extract_subvec V1, ExtIdx) +    uint64_t InsIdx = V.getConstantOperandVal(2); +    if (InsIdx * SmallVT.getScalarSizeInBits() == +        ExtIdx * NVT.getScalarSizeInBits()) +      return DAG.getBitcast(NVT, V.getOperand(1)); +    return DAG.getNode( +        ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, +        DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), +        N->getOperand(1));    }    if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) @@ -19064,6 +19882,57 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,                                NewMask);  } +/// Combine shuffle of shuffle of the form: +/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X +static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf, +                                     SelectionDAG &DAG) { +  if (!OuterShuf->getOperand(1).isUndef()) +    return SDValue(); +  auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0)); +  if (!InnerShuf || !InnerShuf->getOperand(1).isUndef()) +    return SDValue(); + +  ArrayRef<int> OuterMask = OuterShuf->getMask(); +  ArrayRef<int> InnerMask = InnerShuf->getMask(); +  unsigned NumElts = OuterMask.size(); +  assert(NumElts == InnerMask.size() && "Mask length mismatch"); +  SmallVector<int, 32> CombinedMask(NumElts, -1); +  int SplatIndex = -1; +  for (unsigned i = 0; i != NumElts; ++i) { +    // Undef lanes remain undef. +    int OuterMaskElt = OuterMask[i]; +    if (OuterMaskElt == -1) +      continue; + +    // Peek through the shuffle masks to get the underlying source element. +    int InnerMaskElt = InnerMask[OuterMaskElt]; +    if (InnerMaskElt == -1) +      continue; + +    // Initialize the splatted element. +    if (SplatIndex == -1) +      SplatIndex = InnerMaskElt; + +    // Non-matching index - this is not a splat. +    if (SplatIndex != InnerMaskElt) +      return SDValue(); + +    CombinedMask[i] = InnerMaskElt; +  } +  assert((all_of(CombinedMask, [](int M) { return M == -1; }) || +          getSplatIndex(CombinedMask) != -1) && +         "Expected a splat mask"); + +  // TODO: The transform may be a win even if the mask is not legal. 
+  EVT VT = OuterShuf->getValueType(0); +  assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types"); +  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT)) +    return SDValue(); + +  return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0), +                              InnerShuf->getOperand(1), CombinedMask); +} +  /// If the shuffle mask is taking exactly one element from the first vector  /// operand and passing through all other elements from the second vector  /// operand, return the index of the mask element that is choosing an element @@ -19136,8 +20005,7 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,    // element used. Therefore, our new insert element occurs at the shuffle's    // mask index value, not the insert's index value.    // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' -  SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf), -                                        Op0.getOperand(2).getValueType()); +  SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),                       Op1, Op0.getOperand(1), NewInsIndex);  } @@ -19223,6 +20091,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {    if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))      return V; +  if (SDValue V = formSplatFromShuffles(SVN, DAG)) +    return V; +    // If it is a splat, check if the argument vector is another splat or a    // build_vector.    if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { @@ -19234,7 +20105,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {        SDValue L = N0.getOperand(0), R = N0.getOperand(1);        SDLoc DL(N);        EVT EltVT = VT.getScalarType(); -      SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL); +      SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);        SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);        SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);        SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, @@ -19354,16 +20225,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {    if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&        N1.isUndef() && Level < AfterLegalizeVectorOps &&        TLI.isTypeLegal(VT)) { -    auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) { -      if (Scale == 1) -        return SmallVector<int, 8>(Mask.begin(), Mask.end()); - -      SmallVector<int, 8> NewMask; -      for (int M : Mask) -        for (int s = 0; s != Scale; ++s) -          NewMask.push_back(M < 0 ? -1 : Scale * M + s); -      return NewMask; -    };      SDValue BC0 = peekThroughOneUseBitcasts(N0);      if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { @@ -19383,10 +20244,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {          // Scale the shuffle masks to the smaller scalar type.          
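formSplatFromShuffles composes the outer and inner shuffle masks lane by lane and only succeeds when every defined lane ultimately reads the same source element. A standalone sketch of that mask-composition check, using plain ints with -1 marking an undef lane as in ShuffleVectorSDNode masks:

    #include <cassert>
    #include <vector>

    // Compose OuterMask over InnerMask and return the single source element
    // every defined lane selects, or -1 when the composition is not a splat.
    static int composedSplatIndex(const std::vector<int> &OuterMask,
                                  const std::vector<int> &InnerMask) {
      int SplatIndex = -1;
      for (int OuterElt : OuterMask) {
        if (OuterElt == -1)
          continue;                  // undef lanes stay undef
        int InnerElt = InnerMask[OuterElt];
        if (InnerElt == -1)
          continue;
        if (SplatIndex == -1)
          SplatIndex = InnerElt;     // first defined lane picks the candidate
        else if (SplatIndex != InnerElt)
          return -1;                 // two different sources: not a splat
      }
      return SplatIndex;
    }

    int main() {
      // The inner shuffle broadcasts element 2 into lanes 0 and 1; the outer
      // shuffle only reads those lanes, so the pair is a splat of element 2.
      assert(composedSplatIndex({1, 0, -1, 0}, {2, 2, -1, -1}) == 2);
      // Here the outer shuffle also reads inner lane 3, which selects
      // element 0, so the combined mask is not a splat.
      assert(composedSplatIndex({1, 3, -1, 0}, {2, 2, -1, 0}) == -1);
      return 0;
    }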
ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0); -        SmallVector<int, 8> InnerMask = -            ScaleShuffleMask(InnerSVN->getMask(), InnerScale); -        SmallVector<int, 8> OuterMask = -            ScaleShuffleMask(SVN->getMask(), OuterScale); +        SmallVector<int, 8> InnerMask; +        SmallVector<int, 8> OuterMask; +        narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask); +        narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);          // Merge the shuffle masks.          SmallVector<int, 8> NewMask; @@ -19547,7 +20408,9 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {    // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern    // with a VECTOR_SHUFFLE and possible truncate. -  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { +  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && +      VT.isFixedLengthVector() && +      InVal->getOperand(0).getValueType().isFixedLengthVector()) {      SDValue InVec = InVal->getOperand(0);      SDValue EltNo = InVal->getOperand(1);      auto InVecT = InVec.getValueType(); @@ -19576,11 +20439,10 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {              return LegalShuffle;            // If not we must truncate the vector.            if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { -            MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); -            SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy); -            EVT SubVT = -                EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(), -                                 VT.getVectorNumElements()); +            SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N)); +            EVT SubVT = EVT::getVectorVT(*DAG.getContext(), +                                         InVecT.getVectorElementType(), +                                         VT.getVectorNumElements());              return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,                                 LegalShuffle, ZeroIdx);            } @@ -19597,6 +20459,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {    SDValue N0 = N->getOperand(0);    SDValue N1 = N->getOperand(1);    SDValue N2 = N->getOperand(2); +  uint64_t InsIdx = N->getConstantOperandVal(2);    // If inserting an UNDEF, just return the original vector.    if (N1.isUndef()) @@ -19657,11 +20520,6 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,                         N1.getOperand(1), N2); -  if (!isa<ConstantSDNode>(N2)) -    return SDValue(); - -  uint64_t InsIdx = cast<ConstantSDNode>(N2)->getZExtValue(); -    // Push subvector bitcasts to the output, adjusting the index as we go.    
// insert_subvector(bitcast(v), bitcast(s), c1)    // -> bitcast(insert_subvector(v, s, c2)) @@ -19676,19 +20534,18 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {        EVT NewVT;        SDLoc DL(N);        SDValue NewIdx; -      MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());        LLVMContext &Ctx = *DAG.getContext();        unsigned NumElts = VT.getVectorNumElements();        unsigned EltSizeInBits = VT.getScalarSizeInBits();        if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {          unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();          NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale); -        NewIdx = DAG.getConstant(InsIdx * Scale, DL, IdxVT); +        NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);        } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {          unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;          if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {            NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale); -          NewIdx = DAG.getConstant(InsIdx / Scale, DL, IdxVT); +          NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);          }        }        if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) { @@ -19704,8 +20561,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {    // (insert_subvector (insert_subvector A, Idx0), Idx1)    // -> (insert_subvector (insert_subvector A, Idx1), Idx0)    if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && -      N1.getValueType() == N0.getOperand(1).getValueType() && -      isa<ConstantSDNode>(N0.getOperand(2))) { +      N1.getValueType() == N0.getOperand(1).getValueType()) {      unsigned OtherIdx = N0.getConstantOperandVal(2);      if (InsIdx < OtherIdx) {        // Swap nodes. @@ -19722,10 +20578,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {    if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&        N0.getOperand(0).getValueType() == N1.getValueType()) {      unsigned Factor = N1.getValueType().getVectorNumElements(); -      SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); -    Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1; - +    Ops[InsIdx / Factor] = N1;      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);    } @@ -19769,9 +20623,9 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {    // VECREDUCE over 1-element vector is just an extract.    
@@ -19704,8 +20561,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   // (insert_subvector (insert_subvector A, Idx0), Idx1)
   // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
   if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
-      N1.getValueType() == N0.getOperand(1).getValueType() &&
-      isa<ConstantSDNode>(N0.getOperand(2))) {
+      N1.getValueType() == N0.getOperand(1).getValueType()) {
     unsigned OtherIdx = N0.getConstantOperandVal(2);
     if (InsIdx < OtherIdx) {
       // Swap nodes.
@@ -19722,10 +20578,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
       N0.getOperand(0).getValueType() == N1.getValueType()) {
     unsigned Factor = N1.getValueType().getVectorNumElements();
-
     SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
-    Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;
-
+    Ops[InsIdx / Factor] = N1;
     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
   }

@@ -19769,9 +20623,9 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
   // VECREDUCE over 1-element vector is just an extract.
   if (VT.getVectorNumElements() == 1) {
     SDLoc dl(N);
-    SDValue Res = DAG.getNode(
-        ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
-        DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    SDValue Res =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
+                    DAG.getVectorIdxConstant(0, dl));
     if (Res.getValueType() != N->getValueType(0))
       Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
     return Res;
@@ -19904,10 +20758,9 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
     return SDValue();

   SDLoc DL(N);
-  SDValue IndexC =
-      DAG.getConstant(Index0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
-  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N0, IndexC);
-  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N1, IndexC);
+  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
+  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
   SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());

   // If all lanes but 1 are undefined, no need to splat the scalar result.
@@ -19937,6 +20790,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   SDValue Ops[] = {LHS, RHS};
   EVT VT = N->getValueType(0);
   unsigned Opcode = N->getOpcode();
+  SDNodeFlags Flags = N->getFlags();

   // See if we can constant fold the vector operation.
   if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -19960,10 +20814,37 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
         (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
       SDLoc DL(N);
       SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
-                                     RHS.getOperand(0), N->getFlags());
+                                     RHS.getOperand(0), Flags);
       SDValue UndefV = LHS.getOperand(1);
       return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
     }
+
+    // Try to sink a splat shuffle after a binop with a uniform constant.
+    // This is limited to cases where neither the shuffle nor the constant have
+    // undefined elements because that could be poison-unsafe or inhibit
+    // demanded elements analysis. It is further limited to not change a splat
+    // of an inserted scalar because that may be optimized better by
+    // load-folding or other target-specific behaviors.
+    if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
+        Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
+        Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+      // binop (splat X), (splat C) --> splat (binop X, C)
+      SDLoc DL(N);
+      SDValue X = Shuf0->getOperand(0);
+      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+                                  Shuf0->getMask());
+    }
+    if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
+        Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
+        Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+      // binop (splat C), (splat X) --> splat (binop C, X)
+      SDLoc DL(N);
+      SDValue X = Shuf1->getOperand(0);
+      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+                                  Shuf1->getMask());
+    }
   }

   // The following pattern is likely to emerge with vector reduction ops. Moving
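
The block added in the hunk above sinks a splat shuffle below a binop with a uniform constant, so the scalar operation happens once before the splat. A source-level illustration of the pattern it targets (not taken from the patch; requires Clang/GCC vector extensions, and the function is invented):

    // With the new fold, "binop (splat x), (splat 7)" can become
    // "splat (x + 7)": one scalar add followed by a single splat.
    typedef int v4si __attribute__((vector_size(16)));

    v4si add_splat(int x) {
      v4si vx = {x, x, x, x}; // splat of a variable (a splat shuffle in the DAG)
      v4si c = {7, 7, 7, 7};  // uniform (splat) constant operand
      return vx + c;          // candidate to become: splat(x + 7)
    }
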
@@ -20361,8 +21242,8 @@ SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
   // Create a ConstantArray of the two constants.
   Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
   SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
-                                      TD.getPrefTypeAlignment(FPTy));
-  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+                                      TD.getPrefTypeAlign(FPTy));
+  Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();

   // Get offsets to the 0 and 1 elements of the array, so we can select between
   // them.
@@ -20797,7 +21678,10 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
         EVT CCVT = getSetCCResultType(VT);
         ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
         DenormalMode DenormMode = DAG.getDenormalMode(VT);
-        if (DenormMode == DenormalMode::IEEE) {
+        if (DenormMode.Input == DenormalMode::IEEE) {
+          // This is specifically a check for the handling of denormal inputs,
+          // not the result.
+
           // fabs(X) < SmallestNormal ? 0.0 : Est
           const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
           APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
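
The DenormMode.Input check added above guards the select that the "fabs(X) < SmallestNormal ? 0.0 : Est" comment describes. A scalar sketch of why that select exists (illustration only; the function name is invented): for X == 0 the unguarded expansion X * rsqrt(X) evaluates to 0 * inf = NaN, and inputs below the smallest normalized value are handled the same way when input denormals follow IEEE semantics.

    #include <cmath>
    #include <limits>

    float sqrtFromRsqrtSketch(float X, float RsqrtEstimate) {
      float Est = X * RsqrtEstimate; // refined estimate of sqrt(X)
      // Select 0.0 for zero/denormal inputs instead of the (NaN-prone) estimate.
      return std::fabs(X) < std::numeric_limits<float>::min() ? 0.0f : Est;
    }
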
@@ -20849,9 +21733,11 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
                      : (LSN->getAddressingMode() == ISD::PRE_DEC)
                            ? -1 * C->getSExtValue()
                            : 0;
+      uint64_t Size =
+          MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
       return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
               Offset /*base offset*/,
-              Optional<int64_t>(LSN->getMemoryVT().getStoreSize()),
+              Optional<int64_t>(Size),
               LSN->getMemOperand()};
     }
     if (const auto *LN = cast<LifetimeSDNode>(N))
@@ -20911,21 +21797,24 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
   // If we know required SrcValue1 and SrcValue2 have relatively large
   // alignment compared to the size and offset of the access, we may be able
   // to prove they do not alias. This check is conservative for now to catch
-  // cases created by splitting vector types.
+  // cases created by splitting vector types, it only works when the offsets are
+  // multiples of the size of the data.
   int64_t SrcValOffset0 = MUC0.MMO->getOffset();
   int64_t SrcValOffset1 = MUC1.MMO->getOffset();
-  unsigned OrigAlignment0 = MUC0.MMO->getBaseAlignment();
-  unsigned OrigAlignment1 = MUC1.MMO->getBaseAlignment();
+  Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
+  Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
+  auto &Size0 = MUC0.NumBytes;
+  auto &Size1 = MUC1.NumBytes;

   if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
-      MUC0.NumBytes.hasValue() && MUC1.NumBytes.hasValue() &&
-      *MUC0.NumBytes == *MUC1.NumBytes && OrigAlignment0 > *MUC0.NumBytes) {
-    int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
-    int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
+      Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 &&
+      OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
+      SrcValOffset1 % *Size1 == 0) {
+    int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
+    int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
     // There is no overlap between these relatively aligned accesses of
     // similar size. Return no alias.
-    if ((OffAlign0 + *MUC0.NumBytes) <= OffAlign1 ||
-        (OffAlign1 + *MUC1.NumBytes) <= OffAlign0)
+    if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
       return false;
   }
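
The hunk above tightens the relative-alignment disambiguation: both offsets must now be multiples of the (equal) access size before the modulo comparison is trusted. A worked sketch of that arithmetic (illustration only; the helper is invented and assumes a single shared base alignment):

    #include <cstdint>

    // Returns true when two equally-sized accesses off a common, sufficiently
    // aligned base provably do not overlap; false means "cannot conclude".
    static bool noAliasByAlignmentSketch(int64_t Off0, int64_t Off1,
                                         int64_t Size, int64_t BaseAlign) {
      if (Off0 == Off1 || BaseAlign <= Size ||
          Off0 % Size != 0 || Off1 % Size != 0)
        return false;
      int64_t A0 = Off0 % BaseAlign; // position within one aligned block
      int64_t A1 = Off1 % BaseAlign;
      return (A0 + Size) <= A1 || (A1 + Size) <= A0;
    }

    // Example: two 4-byte accesses off a 16-byte aligned base at offsets 4 and
    // 8 give A0 = 4, A1 = 8, and 4 + 4 <= 8, so they provably do not alias.
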
@@ -20938,11 +21827,12 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
     UseAA = false;
 #endif

-  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue()) {
+  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+      Size0.hasValue() && Size1.hasValue()) {
     // Use alias analysis information.
     int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
-    int64_t Overlap0 = *MUC0.NumBytes + SrcValOffset0 - MinOffset;
-    int64_t Overlap1 = *MUC1.NumBytes + SrcValOffset1 - MinOffset;
+    int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
+    int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
     AliasResult AAResult = AA->alias(
         MemoryLocation(MUC0.MMO->getValue(), Overlap0,
                        UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
@@ -21099,10 +21989,10 @@ bool operator!=(const UnitT &, const UnitT &) { return false; }
 // redundant, as this function gets called when visiting every store
 // node, so why not let the work be done on each store as it's visited?
 //
-// I believe this is mainly important because MergeConsecutiveStores
+// I believe this is mainly important because mergeConsecutiveStores
 // is unable to deal with merging stores of different sizes, so unless
 // we improve the chains of all the potential candidates up-front
-// before running MergeConsecutiveStores, it might only see some of
+// before running mergeConsecutiveStores, it might only see some of
 // the nodes that will eventually be candidates, and then not be able
 // to go from a partially-merged state to the desired final
 // fully-merged state.
@@ -21131,6 +22021,12 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
   if (BasePtr.getBase().isUndef())
     return false;

+  // BaseIndexOffset assumes that offsets are fixed-size, which
+  // is not valid for scalable vectors where the offsets are
+  // scaled by `vscale`, so bail out early.
+  if (St->getMemoryVT().isScalableVector())
+    return false;
+
   // Add ST's interval.
   Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
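
The new early bail-out above exists because the interval map that follows needs a fixed byte length per store, while a scalable store covers vscale times its minimum size for a vscale only known at run time. A small sketch of that distinction (illustration only; the helper and its signature are invented, not LLVM API):

    #include <cstdint>
    #include <optional>

    // Fixed-width stores get a concrete byte length; scalable stores do not.
    static std::optional<uint64_t> storeLengthInBytes(uint64_t SizeInBits,
                                                      bool IsScalable) {
      if (IsScalable)
        return std::nullopt;       // actual length is vscale-dependent
      return (SizeInBits + 7) / 8; // same rounding as the Intervals.insert call
    }
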
