author | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000
commit | b60736ec1405bb0a8dd40989f67ef4c93da068ab (patch)
tree | 5c43fbb7c9fc45f0f87e0e6795a86267dbd12f9d /llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
parent | cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Diffstat (limited to 'llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2383
1 file changed, 1488 insertions, 895 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f14b3dba4f31..615bea2a4905 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24,12 +24,14 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -410,9 +412,11 @@ namespace { SDValue visitSUBO(SDNode *N); SDValue visitADDE(SDNode *N); SDValue visitADDCARRY(SDNode *N); + SDValue visitSADDO_CARRY(SDNode *N); SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitSUBCARRY(SDNode *N); + SDValue visitSSUBO_CARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue visitMULFIX(SDNode *N); SDValue useDivRem(SDNode *N); @@ -464,6 +468,7 @@ namespace { SDValue visitFREEZE(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); + SDValue visitSTRICT_FADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); SDValue visitFMA(SDNode *N); @@ -539,6 +544,7 @@ namespace { SDValue convertSelectOfFPConstantsToLoadOffset( const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); + SDValue foldSignChangeInBitcast(SDNode *N); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, @@ -586,7 +592,7 @@ namespace { const SDLoc &DL); SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue MatchLoadCombine(SDNode *N); - SDValue MatchStoreCombine(StoreSDNode *N); + SDValue mergeTruncStores(StoreSDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); @@ -641,14 +647,18 @@ namespace { // Classify the origin of a stored value. enum class StoreSource { Unknown, Constant, Extract, Load }; StoreSource getStoreSource(SDValue StoreVal) { - if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal)) + switch (StoreVal.getOpcode()) { + case ISD::Constant: + case ISD::ConstantFP: return StoreSource::Constant; - if (StoreVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || - StoreVal.getOpcode() == ISD::EXTRACT_SUBVECTOR) + case ISD::EXTRACT_VECTOR_ELT: + case ISD::EXTRACT_SUBVECTOR: return StoreSource::Extract; - if (isa<LoadSDNode>(StoreVal)) + case ISD::LOAD: return StoreSource::Load; - return StoreSource::Unknown; + default: + return StoreSource::Unknown; + } } /// This is a helper function for visitMUL to check the profitability @@ -752,9 +762,7 @@ namespace { /// is legal or custom before legalizing operations, and whether is /// legal (but not custom) after legalization. 
bool hasOperation(unsigned Opcode, EVT VT) { - if (LegalOperations) - return TLI.isOperationLegal(Opcode, VT); - return TLI.isOperationLegalOrCustom(Opcode, VT); + return TLI.isOperationLegalOrCustom(Opcode, VT, LegalOperations); } public: @@ -924,23 +932,40 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const { return false; } -// Returns the SDNode if it is a constant float BuildVector -// or constant float. -static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { - if (isa<ConstantFPSDNode>(N)) - return N.getNode(); - if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) - return N.getNode(); - return nullptr; +static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) { + if (!ScalarTy.isSimple()) + return false; + + uint64_t MaskForTy = 0ULL; + switch (ScalarTy.getSimpleVT().SimpleTy) { + case MVT::i8: + MaskForTy = 0xFFULL; + break; + case MVT::i16: + MaskForTy = 0xFFFFULL; + break; + case MVT::i32: + MaskForTy = 0xFFFFFFFFULL; + break; + default: + return false; + break; + } + + APInt Val; + if (ISD::isConstantSplatVector(N, Val)) + return Val.getLimitedValue() == MaskForTy; + + return false; } -// Determines if it is a constant integer or a build vector of constant +// Determines if it is a constant integer or a splat/build vector of constant // integers (and undefs). // Do not permit build vector implicit truncation. static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N)) return !(Const->isOpaque() && NoOpaques); - if (N.getOpcode() != ISD::BUILD_VECTOR) + if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR) return false; unsigned BitWidth = N.getScalarValueSizeInBits(); for (const SDValue &Op : N->op_values()) { @@ -1554,9 +1579,15 @@ void DAGCombiner::Run(CombineLevel AtLevel) { DAG.ReplaceAllUsesWith(N, &RV); } - // Push the new node and any users onto the worklist - AddToWorklist(RV.getNode()); - AddUsersToWorklist(RV.getNode()); + // Push the new node and any users onto the worklist. Omit this if the + // new node is the EntryToken (e.g. if a store managed to get optimized + // out), because re-visiting the EntryToken and its users will not uncover + // any additional opportunities, but there may be a large number of such + // users, potentially causing compile time explosion. + if (RV.getOpcode() != ISD::EntryToken) { + AddToWorklist(RV.getNode()); + AddUsersToWorklist(RV.getNode()); + } // Finally, if the node is now dead, remove it from the graph. 
The node // may not be dead if the replacement process recursively simplified to @@ -1589,8 +1620,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::USUBO: return visitSUBO(N); case ISD::ADDE: return visitADDE(N); case ISD::ADDCARRY: return visitADDCARRY(N); + case ISD::SADDO_CARRY: return visitSADDO_CARRY(N); case ISD::SUBE: return visitSUBE(N); case ISD::SUBCARRY: return visitSUBCARRY(N); + case ISD::SSUBO_CARRY: return visitSSUBO_CARRY(N); case ISD::SMULFIX: case ISD::SMULFIXSAT: case ISD::UMULFIX: @@ -1646,6 +1679,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); + case ISD::STRICT_FADD: return visitSTRICT_FADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA(N); @@ -1805,6 +1839,10 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { if (OptLevel == CodeGenOpt::None) return SDValue(); + // Don't simplify the token factor if the node itself has too many operands. + if (N->getNumOperands() > TokenFactorInlineLimit) + return SDValue(); + // If the sole user is a token factor, we should make sure we have a // chance to merge them together. This prevents TF chains from inhibiting // optimizations. @@ -1890,7 +1928,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { // If this is an Op, we can remove the op from the list. Remark any // search associated with it as from the current OpNumber. - if (SeenOps.count(Op) != 0) { + if (SeenOps.contains(Op)) { Changed = true; DidPruneOps = true; unsigned OrigOpNumber = 0; @@ -2002,6 +2040,62 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { return Const != nullptr && !Const->isOpaque() ? Const : nullptr; } +/// Return true if 'Use' is a load or a store that uses N as its base pointer +/// and that N may be folded in the load / store addressing mode. 
+static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, + const TargetLowering &TLI) { + EVT VT; + unsigned AS; + + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { + if (LD->isIndexed() || LD->getBasePtr().getNode() != N) + return false; + VT = LD->getMemoryVT(); + AS = LD->getAddressSpace(); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { + if (ST->isIndexed() || ST->getBasePtr().getNode() != N) + return false; + VT = ST->getMemoryVT(); + AS = ST->getAddressSpace(); + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) { + if (LD->isIndexed() || LD->getBasePtr().getNode() != N) + return false; + VT = LD->getMemoryVT(); + AS = LD->getAddressSpace(); + } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) { + if (ST->isIndexed() || ST->getBasePtr().getNode() != N) + return false; + VT = ST->getMemoryVT(); + AS = ST->getAddressSpace(); + } else + return false; + + TargetLowering::AddrMode AM; + if (N->getOpcode() == ISD::ADD) { + AM.HasBaseReg = true; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (Offset) + // [reg +/- imm] + AM.BaseOffs = Offset->getSExtValue(); + else + // [reg +/- reg] + AM.Scale = 1; + } else if (N->getOpcode() == ISD::SUB) { + AM.HasBaseReg = true; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (Offset) + // [reg +/- imm] + AM.BaseOffs = -Offset->getSExtValue(); + else + // [reg +/- reg] + AM.Scale = 1; + } else + return false; + + return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, + VT.getTypeForEVT(*DAG.getContext()), AS); +} + SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 && "Unexpected binary operator"); @@ -2021,12 +2115,12 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { SDValue CT = Sel.getOperand(1); if (!isConstantOrConstantVector(CT, true) && - !isConstantFPBuildVectorOrConstantFP(CT)) + !DAG.isConstantFPBuildVectorOrConstantFP(CT)) return SDValue(); SDValue CF = Sel.getOperand(2); if (!isConstantOrConstantVector(CF, true) && - !isConstantFPBuildVectorOrConstantFP(CF)) + !DAG.isConstantFPBuildVectorOrConstantFP(CF)) return SDValue(); // Bail out if any constants are opaque because we can't constant fold those. @@ -2043,19 +2137,10 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { SDValue CBO = BO->getOperand(SelOpNo ^ 1); if (!CanFoldNonConst && !isConstantOrConstantVector(CBO, true) && - !isConstantFPBuildVectorOrConstantFP(CBO)) + !DAG.isConstantFPBuildVectorOrConstantFP(CBO)) return SDValue(); - EVT VT = Sel.getValueType(); - - // In case of shift value and shift amount may have different VT. For instance - // on x86 shift amount is i8 regardles of LHS type. Bail out if we have - // swapped operands and value types do not match. NB: x86 is fine if operands - // are not swapped with shift amount VT being not bigger than shifted value. - // TODO: that is possible to check for a shift operation, correct VTs and - // still perform optimization on x86 if needed. - if (SelOpNo && VT != CBO.getValueType()) - return SDValue(); + EVT VT = BO->getValueType(0); // We have a select-of-constants followed by a binary operator with a // constant. Eliminate the binop by pulling the constant math into the select. 
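Aside (not part of the patch): the hunk above ends in foldBinOpIntoSelect, whose comment notes that the binary op is eliminated by pulling the constant math into the select arms. A minimal standalone C++ sketch of that scalar identity; the function names here are hypothetical, only the arithmetic comes from the comment:

    #include <cassert>

    // (cond ? CT : CF) + CBO  ==>  cond ? (CT + CBO) : (CF + CBO)
    // The binary op is constant-folded into each select arm, so only the
    // select remains.
    static int selectThenAdd(bool Cond) { return (Cond ? 5 : 12) + 7; }
    static int foldedSelect(bool Cond) { return Cond ? (5 + 7) : (12 + 7); }

    int main() {
      assert(selectThenAdd(false) == foldedSelect(false));
      assert(selectThenAdd(true) == foldedSelect(true));
      return 0;
    }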
@@ -2065,14 +2150,14 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { : DAG.getNode(BinOpcode, DL, VT, CT, CBO); if (!CanFoldNonConst && !NewCT.isUndef() && !isConstantOrConstantVector(NewCT, true) && - !isConstantFPBuildVectorOrConstantFP(NewCT)) + !DAG.isConstantFPBuildVectorOrConstantFP(NewCT)) return SDValue(); SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF) : DAG.getNode(BinOpcode, DL, VT, CF, CBO); if (!CanFoldNonConst && !NewCF.isUndef() && !isConstantOrConstantVector(NewCF, true) && - !isConstantFPBuildVectorOrConstantFP(NewCF)) + !DAG.isConstantFPBuildVectorOrConstantFP(NewCF)) return SDValue(); SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); @@ -2402,8 +2487,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) { - APInt C0 = N0->getConstantOperandAPInt(0); - APInt C1 = N1->getConstantOperandAPInt(0); + const APInt &C0 = N0->getConstantOperandAPInt(0); + const APInt &C1 = N1->getConstantOperandAPInt(0); return DAG.getVScale(DL, VT, C0 + C1); } @@ -2411,9 +2496,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if ((N0.getOpcode() == ISD::ADD) && (N0.getOperand(1).getOpcode() == ISD::VSCALE) && (N1.getOpcode() == ISD::VSCALE)) { - auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0); - auto VS1 = N1->getConstantOperandAPInt(0); - auto VS = DAG.getVScale(DL, VT, VS0 + VS1); + const APInt &VS0 = N0.getOperand(1)->getConstantOperandAPInt(0); + const APInt &VS1 = N1->getConstantOperandAPInt(0); + SDValue VS = DAG.getVScale(DL, VT, VS0 + VS1); return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS); } @@ -2631,36 +2716,18 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { return SDValue(); } -static SDValue flipBoolean(SDValue V, const SDLoc &DL, - SelectionDAG &DAG, const TargetLowering &TLI) { - EVT VT = V.getValueType(); - - SDValue Cst; - switch (TLI.getBooleanContents(VT)) { - case TargetLowering::ZeroOrOneBooleanContent: - case TargetLowering::UndefinedBooleanContent: - Cst = DAG.getConstant(1, DL, VT); - break; - case TargetLowering::ZeroOrNegativeOneBooleanContent: - Cst = DAG.getAllOnesConstant(DL, VT); - break; - } - - return DAG.getNode(ISD::XOR, DL, VT, V, Cst); -} - /** * Flips a boolean if it is cheaper to compute. If the Force parameters is set, * then the flip also occurs if computing the inverse is the same cost. * This function returns an empty SDValue in case it cannot flip the boolean * without increasing the cost of the computation. If you want to flip a boolean - * no matter what, use flipBoolean. + * no matter what, use DAG.getLogicalNOT. 
*/ static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG, const TargetLowering &TLI, bool Force) { if (Force && isa<ConstantSDNode>(V)) - return flipBoolean(V, SDLoc(V), DAG, TLI); + return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType()); if (V.getOpcode() != ISD::XOR) return SDValue(); @@ -2687,7 +2754,7 @@ static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG, if (IsFlip) return V.getOperand(0); if (Force) - return flipBoolean(V, SDLoc(V), DAG, TLI); + return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType()); return SDValue(); } @@ -2724,8 +2791,8 @@ SDValue DAGCombiner::visitADDO(SDNode *N) { if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) { SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(), DAG.getConstant(0, DL, VT), N0.getOperand(0)); - return CombineTo(N, Sub, - flipBoolean(Sub.getValue(1), DL, DAG, TLI)); + return CombineTo( + N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1))); } if (SDValue Combined = visitUADDOLike(N0, N1, N)) @@ -2820,6 +2887,28 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + SDLoc DL(N); + + // canonicalize constant to RHS + ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + if (N0C && !N1C) + return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn); + + // fold (saddo_carry x, y, false) -> (saddo x, y) + if (isNullConstant(CarryIn)) { + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0))) + return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1); + } + + return SDValue(); +} + /** * If we are facing some sort of diamond carry propapagtion pattern try to * break it up to generate something like: @@ -3005,8 +3094,8 @@ SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDLoc DL(N); SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1, N0.getOperand(0), NotC); - return CombineTo(N, Sub, - flipBoolean(Sub.getValue(1), DL, DAG, TLI)); + return CombineTo( + N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1))); } // Iff the flag result is dead: @@ -3111,6 +3200,13 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { // 0 - X --> X if X is 0 or the minimum signed value. return N1; } + + // Convert 0 - abs(x). + SDValue Result; + if (N1->getOpcode() == ISD::ABS && + !TLI.isOperationLegalOrCustom(ISD::ABS, VT) && + TLI.expandABS(N1.getNode(), Result, DAG, true)) + return Result; } // Canonicalize (sub -1, x) -> ~x, i.e. 
(xor x, -1) @@ -3306,12 +3402,10 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); SDValue S0 = N1.getOperand(0); - if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) { - unsigned OpSizeInBits = VT.getScalarSizeInBits(); + if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) - if (C->getAPIntValue() == (OpSizeInBits - 1)) + if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); - } } } @@ -3342,7 +3436,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C)) if (N1.getOpcode() == ISD::VSCALE) { - APInt IntVal = N1.getConstantOperandAPInt(0); + const APInt &IntVal = N1.getConstantOperandAPInt(0); return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal)); } @@ -3501,6 +3595,21 @@ SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSSUBO_CARRY(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + + // fold (ssubo_carry x, y, false) -> (ssubo x, y) + if (isNullConstant(CarryIn)) { + if (!LegalOperations || + TLI.isOperationLegalOrCustom(ISD::SSUBO, N->getValueType(0))) + return DAG.getNode(ISD::SSUBO, SDLoc(N), N->getVTList(), N0, N1); + } + + return SDValue(); +} + // Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and // UMULFIXSAT here. SDValue DAGCombiner::visitMULFIX(SDNode *N) { @@ -3606,19 +3715,30 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { getShiftAmountTy(N0.getValueType())))); } - // Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub. + // Try to transform: + // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub. // mul x, (2^N + 1) --> add (shl x, N), x // mul x, (2^N - 1) --> sub (shl x, N), x // Examples: x * 33 --> (x << 5) + x // x * 15 --> (x << 4) - x // x * -33 --> -((x << 5) + x) // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4) + // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub. + // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M)) + // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M)) + // Examples: x * 0x8800 --> (x << 15) + (x << 11) + // x * 0xf800 --> (x << 16) - (x << 11) + // x * -0x8800 --> -((x << 15) + (x << 11)) + // x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16) if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) { // TODO: We could handle more general decomposition of any constant by // having the target set a limit on number of ops and making a // callback to determine that sequence (similar to sqrt expansion). unsigned MathOp = ISD::DELETED_NODE; APInt MulC = ConstValue1.abs(); + // The constant `2` should be treated as (2^0 + 1). + unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros(); + MulC.lshrInPlace(TZeros); if ((MulC - 1).isPowerOf2()) MathOp = ISD::ADD; else if ((MulC + 1).isPowerOf2()) @@ -3627,12 +3747,17 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if (MathOp != ISD::DELETED_NODE) { unsigned ShAmt = MathOp == ISD::ADD ? 
(MulC - 1).logBase2() : (MulC + 1).logBase2(); + ShAmt += TZeros; assert(ShAmt < VT.getScalarSizeInBits() && "multiply-by-constant generated out of bounds shift"); SDLoc DL(N); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT)); - SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0); + SDValue R = + TZeros ? DAG.getNode(MathOp, DL, VT, Shl, + DAG.getNode(ISD::SHL, DL, VT, N0, + DAG.getConstant(TZeros, DL, VT))) + : DAG.getNode(MathOp, DL, VT, Shl, N0); if (ConstValue1.isNegative()) R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R); return R; @@ -3684,11 +3809,42 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)). if (N0.getOpcode() == ISD::VSCALE) if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) { - APInt C0 = N0.getConstantOperandAPInt(0); - APInt C1 = NC1->getAPIntValue(); + const APInt &C0 = N0.getConstantOperandAPInt(0); + const APInt &C1 = NC1->getAPIntValue(); return DAG.getVScale(SDLoc(N), VT, C0 * C1); } + // Fold ((mul x, 0/undef) -> 0, + // (mul x, 1) -> x) -> x) + // -> and(x, mask) + // We can replace vectors with '0' and '1' factors with a clearing mask. + if (VT.isFixedLengthVector()) { + unsigned NumElts = VT.getVectorNumElements(); + SmallBitVector ClearMask; + ClearMask.reserve(NumElts); + auto IsClearMask = [&ClearMask](ConstantSDNode *V) { + if (!V || V->isNullValue()) { + ClearMask.push_back(true); + return true; + } + ClearMask.push_back(false); + return V->isOne(); + }; + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) && + ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) { + assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector"); + SDLoc DL(N); + EVT LegalSVT = N1.getOperand(0).getValueType(); + SDValue Zero = DAG.getConstant(0, DL, LegalSVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT); + SmallVector<SDValue, 16> Mask(NumElts, AllOnes); + for (unsigned I = 0; I != NumElts; ++I) + if (ClearMask[I]) + Mask[I] = Zero; + return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getBuildVector(VT, DL, Mask)); + } + } + // reassociate mul if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags())) return RMUL; @@ -4108,9 +4264,9 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { - SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { // fold (urem x, pow2) -> (and x, pow2-1) + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); @@ -4118,6 +4274,7 @@ SDValue DAGCombiner::visitREM(SDNode *N) { if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + SDValue NegOne = DAG.getAllOnesConstant(DL, VT); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); @@ -4186,7 +4343,8 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { // If the type twice as wide is legal, transform the mulhs to a wider multiply // plus a shift. 
- if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) { + if (!TLI.isOperationLegalOrCustom(ISD::MULHS, VT) && VT.isSimple() && + !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); @@ -4242,7 +4400,8 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. - if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) { + if (!TLI.isOperationLegalOrCustom(ISD::MULHU, VT) && VT.isSimple() && + !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); @@ -4448,6 +4607,10 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1); } + // Simplify the operands using demanded-bits information. + if (SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -4916,8 +5079,15 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, if (!LDST->isSimple()) return false; + EVT LdStMemVT = LDST->getMemoryVT(); + + // Bail out when changing the scalable property, since we can't be sure that + // we're actually narrowing here. + if (LdStMemVT.isScalableVector() != MemVT.isScalableVector()) + return false; + // Verify that we are actually reducing a load width here. - if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits()) + if (LdStMemVT.bitsLT(MemVT)) return false; // Ensure that this isn't going to produce an unsupported memory access. @@ -5272,6 +5442,31 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return N1; if (ISD::isBuildVectorAllOnes(N1.getNode())) return N0; + + // fold (and (masked_load) (build_vec (x, ...))) to zext_masked_load + auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0); + auto *BVec = dyn_cast<BuildVectorSDNode>(N1); + if (MLoad && BVec && MLoad->getExtensionType() == ISD::EXTLOAD && + N0.hasOneUse() && N1.hasOneUse()) { + EVT LoadVT = MLoad->getMemoryVT(); + EVT ExtVT = VT; + if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) { + // For this AND to be a zero extension of the masked load the elements + // of the BuildVec must mask the bottom bits of the extended element + // type + if (ConstantSDNode *Splat = BVec->getConstantSplatNode()) { + uint64_t ElementSize = + LoadVT.getVectorElementType().getScalarSizeInBits(); + if (Splat->getAPIntValue().isMask(ElementSize)) { + return DAG.getMaskedLoad( + ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(), + MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(), + LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(), + ISD::ZEXTLOAD, MLoad->isExpandingLoad()); + } + } + } + } } // fold (and c1, c2) -> c1&c2 @@ -5440,6 +5635,28 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } + // fold (and (masked_gather x)) -> (zext_masked_gather x) + if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) { + EVT MemVT = GN0->getMemoryVT(); + EVT ScalarVT = MemVT.getScalarType(); + + if (SDValue(GN0, 0).hasOneUse() && + isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) && + TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), + GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; + + SDValue ZExtLoad = DAG.getMaskedGather( + DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops, + GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD); + + CombineTo(N, 
ZExtLoad); + AddToWorklist(ZExtLoad.getNode()); + // Avoid recheck of N. + return SDValue(N, 0); + } + } + // fold (and (load x), 255) -> (zextload x, i8) // fold (and (extload x, i16), 255) -> (zextload x, i8) // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) @@ -5534,6 +5751,31 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue V = combineShiftAnd1ToBitTest(N, DAG)) return V; + // Recognize the following pattern: + // + // AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask) + // + // where bitmask is a mask that clears the upper bits of AndVT. The + // number of bits in bitmask must be a power of two. + auto IsAndZeroExtMask = [](SDValue LHS, SDValue RHS) { + if (LHS->getOpcode() != ISD::SIGN_EXTEND) + return false; + + auto *C = dyn_cast<ConstantSDNode>(RHS); + if (!C) + return false; + + if (!C->getAPIntValue().isMask( + LHS.getOperand(0).getValueType().getFixedSizeInBits())) + return false; + + return true; + }; + + // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...). + if (IsAndZeroExtMask(N0, N1)) + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); + return SDValue(); } @@ -6782,11 +7024,11 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, return None; } -static unsigned LittleEndianByteAt(unsigned BW, unsigned i) { +static unsigned littleEndianByteAt(unsigned BW, unsigned i) { return i; } -static unsigned BigEndianByteAt(unsigned BW, unsigned i) { +static unsigned bigEndianByteAt(unsigned BW, unsigned i) { return BW - i - 1; } @@ -6803,8 +7045,8 @@ static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, bool BigEndian = true, LittleEndian = true; for (unsigned i = 0; i < Width; i++) { int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; - LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i); - BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i); + LittleEndian &= CurrentByteOffset == littleEndianByteAt(Width, i); + BigEndian &= CurrentByteOffset == bigEndianByteAt(Width, i); if (!BigEndian && !LittleEndian) return None; } @@ -6847,80 +7089,90 @@ static SDValue stripTruncAndExt(SDValue Value) { /// p[3] = (val >> 0) & 0xFF; /// => /// *((i32)p) = BSWAP(val); -SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { +SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { + // The matching looks for "store (trunc x)" patterns that appear early but are + // likely to be replaced by truncating store nodes during combining. + // TODO: If there is evidence that running this later would help, this + // limitation could be removed. Legality checks may need to be added + // for the created store and optional bswap/rotate. + if (LegalOperations) + return SDValue(); + // Collect all the stores in the chain. SDValue Chain; SmallVector<StoreSDNode *, 8> Stores; for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) { // TODO: Allow unordered atomics when wider type is legal (see D66309) - if (Store->getMemoryVT() != MVT::i8 || + EVT MemVT = Store->getMemoryVT(); + if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) || !Store->isSimple() || Store->isIndexed()) return SDValue(); Stores.push_back(Store); Chain = Store->getChain(); } - // Handle the simple type only. - unsigned Width = Stores.size(); - EVT VT = EVT::getIntegerVT( - *DAG.getContext(), Width * N->getMemoryVT().getSizeInBits()); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + // There is no reason to continue if we do not have at least a pair of stores. 
+ if (Stores.size() < 2) return SDValue(); - if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT)) + // Handle simple types only. + LLVMContext &Context = *DAG.getContext(); + unsigned NumStores = Stores.size(); + unsigned NarrowNumBits = N->getMemoryVT().getScalarSizeInBits(); + unsigned WideNumBits = NumStores * NarrowNumBits; + EVT WideVT = EVT::getIntegerVT(Context, WideNumBits); + if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64) return SDValue(); - // Check if all the bytes of the combined value we are looking at are stored - // to the same base address. Collect bytes offsets from Base address into - // ByteOffsets. - SDValue CombinedValue; - SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX); + // Check if all bytes of the source value that we are looking at are stored + // to the same base address. Collect offsets from Base address into OffsetMap. + SDValue SourceValue; + SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX); int64_t FirstOffset = INT64_MAX; StoreSDNode *FirstStore = nullptr; Optional<BaseIndexOffset> Base; for (auto Store : Stores) { - // All the stores store different byte of the CombinedValue. A truncate is - // required to get that byte value. + // All the stores store different parts of the CombinedValue. A truncate is + // required to get the partial value. SDValue Trunc = Store->getValue(); if (Trunc.getOpcode() != ISD::TRUNCATE) return SDValue(); - // A shift operation is required to get the right byte offset, except the - // first byte. + // Other than the first/last part, a shift operation is required to get the + // offset. int64_t Offset = 0; - SDValue Value = Trunc.getOperand(0); - if (Value.getOpcode() == ISD::SRL || - Value.getOpcode() == ISD::SRA) { - auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1)); - // Trying to match the following pattern. The shift offset must be - // a constant and a multiple of 8. It is the byte offset in "y". + SDValue WideVal = Trunc.getOperand(0); + if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) && + isa<ConstantSDNode>(WideVal.getOperand(1))) { + // The shift amount must be a constant multiple of the narrow type. + // It is translated to the offset address in the wide source value "y". // - // x = srl y, offset + // x = srl y, ShiftAmtC // i8 z = trunc x // store z, ... - if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8)) + uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1); + if (ShiftAmtC % NarrowNumBits != 0) return SDValue(); - Offset = ShiftOffset->getSExtValue()/8; - Value = Value.getOperand(0); + Offset = ShiftAmtC / NarrowNumBits; + WideVal = WideVal.getOperand(0); } - // Stores must share the same combined value with different offsets. - if (!CombinedValue) - CombinedValue = Value; - else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value)) + // Stores must share the same source value with different offsets. + // Truncate and extends should be stripped to get the single source value. + if (!SourceValue) + SourceValue = WideVal; + else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal)) return SDValue(); - - // The trunc and all the extend operation should be stripped to get the - // real value we are stored. - else if (CombinedValue.getValueType() != VT) { - if (Value.getValueType() == VT || - Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits()) - CombinedValue = Value; - // Give up if the combined value type is smaller than the store size. 
- if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits()) + else if (SourceValue.getValueType() != WideVT) { + if (WideVal.getValueType() == WideVT || + WideVal.getScalarValueSizeInBits() > + SourceValue.getScalarValueSizeInBits()) + SourceValue = WideVal; + // Give up if the source value type is smaller than the store size. + if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits()) return SDValue(); } - // Stores must share the same base address + // Stores must share the same base address. BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG); int64_t ByteOffsetFromBase = 0; if (!Base) @@ -6928,60 +7180,78 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) return SDValue(); - // Remember the first byte store + // Remember the first store. if (ByteOffsetFromBase < FirstOffset) { FirstStore = Store; FirstOffset = ByteOffsetFromBase; } // Map the offset in the store and the offset in the combined value, and // early return if it has been set before. - if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX) + if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX) return SDValue(); - ByteOffsets[Offset] = ByteOffsetFromBase; + OffsetMap[Offset] = ByteOffsetFromBase; } assert(FirstOffset != INT64_MAX && "First byte offset must be set"); assert(FirstStore && "First store must be set"); - // Check if the bytes of the combined value we are looking at match with - // either big or little endian value store. - Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); - if (!IsBigEndian.hasValue()) - return SDValue(); - - // The node we are looking at matches with the pattern, check if we can - // replace it with a single bswap if needed and store. - - // If the store needs byte swap check if the target supports it - bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian; - - // Before legalize we can introduce illegal bswaps which will be later - // converted to an explicit bswap sequence. This way we end up with a single - // store and byte shuffling instead of several stores and byte shuffling. - if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) - return SDValue(); - // Check that a store of the wide type is both allowed and fast on the target + const DataLayout &Layout = DAG.getDataLayout(); bool Fast = false; - bool Allowed = - TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *FirstStore->getMemOperand(), &Fast); + bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT, + *FirstStore->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); - if (VT != CombinedValue.getValueType()) { - assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() && - "Get unexpected store value to combine"); - CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, - CombinedValue); + // Check if the pieces of the value are going to the expected places in memory + // to merge the stores. + auto checkOffsets = [&](bool MatchLittleEndian) { + if (MatchLittleEndian) { + for (unsigned i = 0; i != NumStores; ++i) + if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } else { // MatchBigEndian by reversing loop counter. + for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j) + if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset) + return false; + } + return true; + }; + + // Check if the offsets line up for the native data layout of this target. 
+ bool NeedBswap = false; + bool NeedRotate = false; + if (!checkOffsets(Layout.isLittleEndian())) { + // Special-case: check if byte offsets line up for the opposite endian. + if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian())) + NeedBswap = true; + else if (NumStores == 2 && checkOffsets(Layout.isBigEndian())) + NeedRotate = true; + else + return SDValue(); + } + + SDLoc DL(N); + if (WideVT != SourceValue.getValueType()) { + assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits && + "Unexpected store value to merge"); + SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue); } - if (NeedsBswap) - CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue); + // Before legalize we can introduce illegal bswaps/rotates which will be later + // converted to an explicit bswap sequence. This way we end up with a single + // store and byte shuffling instead of several stores and byte shuffling. + if (NeedBswap) { + SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue); + } else if (NeedRotate) { + assert(WideNumBits % 2 == 0 && "Unexpected type for rotate"); + SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT); + SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt); + } SDValue NewStore = - DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(), - FirstStore->getPointerInfo(), FirstStore->getAlignment()); + DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(), + FirstStore->getPointerInfo(), FirstStore->getAlign()); // Rely on other DAG combine rules to remove the other individual stores. DAG.ReplaceAllUsesWith(N, NewStore.getNode()); @@ -7036,8 +7306,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; return IsBigEndianTarget - ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) - : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); + ? bigEndianByteAt(LoadByteWidth, P.ByteOffset) + : littleEndianByteAt(LoadByteWidth, P.ByteOffset); }; Optional<BaseIndexOffset> Base; @@ -7164,10 +7434,10 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { if (!Allowed || !Fast) return SDValue(); - SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, - SDLoc(N), VT, Chain, FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), MemVT, - FirstLoad->getAlignment()); + SDValue NewLoad = + DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, + Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) @@ -7337,9 +7607,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (N0.hasOneUse()) { // FIXME Can we handle multiple uses? Could we token factor the chain // results from the new/old setcc? 
- SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, - N0.getOperand(0), - N0Opcode == ISD::STRICT_FSETCCS); + SDValue SetCC = + DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, + N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS); CombineTo(N, SetCC); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1)); recursivelyDeleteUnusedNodes(N0.getNode()); @@ -7440,12 +7710,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) { SDValue A0 = A.getOperand(0), A1 = A.getOperand(1); SDValue S0 = S.getOperand(0); - if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) { - unsigned OpSizeInBits = VT.getScalarSizeInBits(); + if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1))) - if (C->getAPIntValue() == (OpSizeInBits - 1)) + if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1)) return DAG.getNode(ISD::ABS, DL, VT, S0); - } } } @@ -7980,10 +8248,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)). if (N0.getOpcode() == ISD::VSCALE) if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) { - auto DL = SDLoc(N); - APInt C0 = N0.getConstantOperandAPInt(0); - APInt C1 = NC1->getAPIntValue(); - return DAG.getVScale(DL, VT, C0 << C1); + const APInt &C0 = N0.getConstantOperandAPInt(0); + const APInt &C1 = NC1->getAPIntValue(); + return DAG.getVScale(SDLoc(N), VT, C0 << C1); } return SDValue(); @@ -8032,12 +8299,6 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, if (NarrowVT != RightOp.getOperand(0).getValueType()) return SDValue(); - // Only transform into mulh if mulh for the narrow type is cheaper than - // a multiply followed by a shift. This should also check if mulh is - // legal for NarrowVT on the target. - if (!TLI.isMulhCheaperThanMulShift(NarrowVT)) - return SDValue(); - // Proceed with the transformation if the wide type is twice as large // as the narrow type. unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits(); @@ -8055,6 +8316,10 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, // we use mulhs. Othewise, zero extends (zext) use mulhu. unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU; + // Combine to mulh if mulh is legal/custom for the narrow type on the target. + if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT)) + return SDValue(); + SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), RightOp.getOperand(0)); return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1) @@ -8556,8 +8821,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { RHS->getAddressSpace(), NewAlign, RHS->getMemOperand()->getFlags(), &Fast) && Fast) { - SDValue NewPtr = - DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL); + SDValue NewPtr = DAG.getMemBasePlusOffset( + RHS->getBasePtr(), TypeSize::Fixed(PtrOff), DL); AddToWorklist(NewPtr.getNode()); SDValue Load = DAG.getLoad( VT, DL, RHS->getChain(), NewPtr, @@ -9154,16 +9419,75 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } +bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) { + if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD) + return false; + + // For now we check only the LHS of the add. 
+ SDValue LHS = Index.getOperand(0); + SDValue SplatVal = DAG.getSplatValue(LHS); + if (!SplatVal) + return false; + + BasePtr = SplatVal; + Index = Index.getOperand(1); + return true; +} + +// Fold sext/zext of index into index type. +bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index, + bool Scaled, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (Index.getOpcode() == ISD::ZERO_EXTEND) { + SDValue Op = Index.getOperand(0); + MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED); + if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { + Index = Op; + return true; + } + } + + if (Index.getOpcode() == ISD::SIGN_EXTEND) { + SDValue Op = Index.getOperand(0); + MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED); + if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) { + Index = Op; + return true; + } + } + + return false; +} + SDValue DAGCombiner::visitMSCATTER(SDNode *N) { MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N); SDValue Mask = MSC->getMask(); SDValue Chain = MSC->getChain(); + SDValue Index = MSC->getIndex(); + SDValue Scale = MSC->getScale(); + SDValue StoreVal = MSC->getValue(); + SDValue BasePtr = MSC->getBasePtr(); SDLoc DL(N); // Zap scatters with a zero mask. if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + if (refineUniformBase(BasePtr, Index, DAG)) { + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter( + DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops, + MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore()); + } + + if (refineIndexType(MSC, Index, MSC->isIndexScaled(), DAG)) { + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter( + DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops, + MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore()); + } + return SDValue(); } @@ -9177,6 +9501,14 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + // If this is a masked load with an all ones mask, we can use a unmasked load. + // FIXME: Can we do this for indexed, compressing, or truncating stores? + if (ISD::isBuildVectorAllOnes(Mask.getNode()) && + MST->isUnindexed() && !MST->isCompressingStore() && + !MST->isTruncatingStore()) + return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), + MST->getBasePtr(), MST->getMemOperand()); + // Try transforming N to an indexed store. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); @@ -9187,11 +9519,32 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { SDValue DAGCombiner::visitMGATHER(SDNode *N) { MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N); SDValue Mask = MGT->getMask(); + SDValue Chain = MGT->getChain(); + SDValue Index = MGT->getIndex(); + SDValue Scale = MGT->getScale(); + SDValue PassThru = MGT->getPassThru(); + SDValue BasePtr = MGT->getBasePtr(); SDLoc DL(N); // Zap gathers with a zero mask. 
if (ISD::isBuildVectorAllZeros(Mask.getNode())) - return CombineTo(N, MGT->getPassThru(), MGT->getChain()); + return CombineTo(N, PassThru, MGT->getChain()); + + if (refineUniformBase(BasePtr, Index, DAG)) { + SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), + PassThru.getValueType(), DL, Ops, + MGT->getMemOperand(), MGT->getIndexType(), + MGT->getExtensionType()); + } + + if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) { + SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), + PassThru.getValueType(), DL, Ops, + MGT->getMemOperand(), MGT->getIndexType(), + MGT->getExtensionType()); + } return SDValue(); } @@ -9205,6 +9558,16 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MLD->getPassThru(), MLD->getChain()); + // If this is a masked load with an all ones mask, we can use a unmasked load. + // FIXME: Can we do this for indexed, expanding, or extending loads? + if (ISD::isBuildVectorAllOnes(Mask.getNode()) && + MLD->isUnindexed() && !MLD->isExpandingLoad() && + MLD->getExtensionType() == ISD::NON_EXTLOAD) { + SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), + MLD->getBasePtr(), MLD->getMemOperand()); + return CombineTo(N, NewLd, NewLd.getValue(1)); + } + // Try transforming N to an indexed load. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); @@ -9364,6 +9727,113 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2); } } + + // Match VSELECTs into add with unsigned saturation. + if (hasOperation(ISD::UADDSAT, VT)) { + // Check if one of the arms of the VSELECT is vector with all bits set. + // If it's on the left side invert the predicate to simplify logic below. + SDValue Other; + ISD::CondCode SatCC = CC; + if (ISD::isBuildVectorAllOnes(N1.getNode())) { + Other = N2; + SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType()); + } else if (ISD::isBuildVectorAllOnes(N2.getNode())) { + Other = N1; + } + + if (Other && Other.getOpcode() == ISD::ADD) { + SDValue CondLHS = LHS, CondRHS = RHS; + SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); + + // Canonicalize condition operands. + if (SatCC == ISD::SETUGE) { + std::swap(CondLHS, CondRHS); + SatCC = ISD::SETULE; + } + + // We can test against either of the addition operands. + // x <= x+y ? x+y : ~0 --> uaddsat x, y + // x+y >= x ? x+y : ~0 --> uaddsat x, y + if (SatCC == ISD::SETULE && Other == CondRHS && + (OpLHS == CondLHS || OpRHS == CondLHS)) + return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); + + if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) && + CondLHS == OpLHS) { + // If the RHS is a constant we have to reverse the const + // canonicalization. + // x >= ~C ? x+C : ~0 --> uaddsat x, C + auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { + return Cond->getAPIntValue() == ~Op->getAPIntValue(); + }; + if (SatCC == ISD::SETULE && + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT)) + return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); + } + } + } + + // Match VSELECTs into sub with unsigned saturation. + if (hasOperation(ISD::USUBSAT, VT)) { + // Check if one of the arms of the VSELECT is a zero vector. If it's on + // the left side invert the predicate to simplify logic below. 
+ SDValue Other; + ISD::CondCode SatCC = CC; + if (ISD::isBuildVectorAllZeros(N1.getNode())) { + Other = N2; + SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType()); + } else if (ISD::isBuildVectorAllZeros(N2.getNode())) { + Other = N1; + } + + if (Other && Other.getNumOperands() == 2 && Other.getOperand(0) == LHS) { + SDValue CondRHS = RHS; + SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? x-y : 0 --> usubsat x, y + // x > y ? x-y : 0 --> usubsat x, y + if ((SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) && + Other.getOpcode() == ISD::SUB && OpRHS == CondRHS) + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); + + if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) { + if (isa<BuildVectorSDNode>(CondRHS)) { + // If the RHS is a constant we have to reverse the const + // canonicalization. + // x > C-1 ? x+-C : 0 --> usubsat x, C + auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { + return (!Op && !Cond) || + (Op && Cond && + Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); + }; + if (SatCC == ISD::SETUGT && Other.getOpcode() == ISD::ADD && + ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, + /*AllowUndefs*/ true)) { + OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + OpRHS); + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); + } + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> usubsat x, C + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignMask()) { + // Note that we have to rebuild the RHS constant here to ensure + // we don't rely on particular values of undef lanes. + OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); + return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); + } + } + } + } + } + } } if (SimplifySelectOps(N, N1, N2)) @@ -9722,14 +10192,14 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue BasePtr = LN0->getBasePtr(); for (unsigned Idx = 0; Idx < NumSplits; Idx++) { const unsigned Offset = Idx * Stride; - const unsigned Align = MinAlign(LN0->getAlignment(), Offset); + const Align Align = commonAlignment(LN0->getAlign(), Offset); SDValue SplitLoad = DAG.getExtLoad( ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr, LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); - BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL); + BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(Stride), DL); Loads.push_back(SplitLoad.getValue(0)); Chains.push_back(SplitLoad.getValue(1)); @@ -10146,7 +10616,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - EVT N00VT = N0.getOperand(0).getValueType(); + EVT N00VT = N00.getValueType(); // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. 
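Aside (not part of the patch): the visitVSELECT hunk above matches compare-and-select idioms into saturating arithmetic, e.g. "x <= x+y ? x+y : ~0 --> uaddsat x, y" and "x >= y ? x-y : 0 --> usubsat x, y". A minimal standalone C++ check of those identities over all uint8_t inputs; the helper names are made up for illustration:

    #include <cassert>
    #include <cstdint>

    // Wrapping-arithmetic select forms that the new combines recognize.
    static uint8_t selUAddSat(uint8_t X, uint8_t Y) {
      uint8_t Sum = uint8_t(X + Y);
      return X <= Sum ? Sum : uint8_t(~0);   // x <= x+y ? x+y : ~0
    }
    static uint8_t selUSubSat(uint8_t X, uint8_t Y) {
      return X >= Y ? uint8_t(X - Y) : 0;    // x >= y ? x-y : 0
    }

    // Reference saturating operations.
    static uint8_t uaddsat(uint8_t X, uint8_t Y) {
      unsigned S = unsigned(X) + unsigned(Y);
      return S > 0xFF ? 0xFF : uint8_t(S);
    }
    static uint8_t usubsat(uint8_t X, uint8_t Y) {
      return X > Y ? uint8_t(X - Y) : 0;
    }

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 256; ++Y) {
          assert(selUAddSat(X, Y) == uaddsat(X, Y));
          assert(selUSubSat(X, Y) == usubsat(X, Y));
        }
      return 0;
    }

On targets where ISD::UADDSAT / ISD::USUBSAT are legal or custom (the hasOperation checks in the hunk), the compare-plus-select pair collapses into a single saturating node.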
@@ -10240,6 +10710,29 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); } + // fold sext (not i1 X) -> add (zext i1 X), -1 + // TODO: This could be extended to handle bool vectors. + if (N0.getValueType() == MVT::i1 && isBitwiseNot(N0) && N0.hasOneUse() && + (!LegalOperations || (TLI.isOperationLegal(ISD::ZERO_EXTEND, VT) && + TLI.isOperationLegal(ISD::ADD, VT)))) { + // If we can eliminate the 'not', the sext form should be better + if (SDValue NewXor = visitXOR(N0.getNode())) { + // Returning N0 is a form of in-visit replacement that may have + // invalidated N0. + if (NewXor.getNode() == N0.getNode()) { + // Return SDValue here as the xor should have already been replaced in + // this sext. + return SDValue(); + } else { + // Return a new sext with the new xor. + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor); + } + } + + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); + } + return SDValue(); } @@ -10507,13 +11000,16 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType()); } - // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc + // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc) SDLoc DL(N); + EVT N0VT = N0.getValueType(); + EVT N00VT = N0.getOperand(0).getValueType(); if (SDValue SCC = SimplifySelectCC( - DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), - DAG.getConstant(0, DL, VT), + DL, N0.getOperand(0), N0.getOperand(1), + DAG.getBoolConstant(true, DL, N0VT, N00VT), + DAG.getBoolConstant(false, DL, N0VT, N00VT), cast<CondCodeSDNode>(N0.getOperand(2))->get(), true)) - return SCC; + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SCC); } // (zext (shl (zext x), cst)) -> (shl (zext x), cst) @@ -10602,22 +11098,26 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (load x)) -> (aext (truncate (extload x))) // None of the supported targets knows how to perform load and any_ext - // on vectors in one instruction. We only perform this transformation on - // scalars. - if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && - ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + // on vectors in one instruction, so attempt to fold to zext instead. + if (VT.isVector()) { + // Try to simplify (zext (load x)). + if (SDValue foldedExt = + tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, + ISD::ZEXTLOAD, ISD::ZERO_EXTEND)) + return foldedExt; + } else if (ISD::isNON_EXTLoad(N0.getNode()) && + ISD::isUNINDEXEDLoad(N0.getNode()) && + TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; - SmallVector<SDNode*, 4> SetCCs; + SmallVector<SDNode *, 4> SetCCs; if (!N0.hasOneUse()) - DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, - TLI); + DoXform = + ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, - LN0->getChain(), - LN0->getBasePtr(), N0.getValueType(), - LN0->getMemOperand()); + LN0->getChain(), LN0->getBasePtr(), + N0.getValueType(), LN0->getMemOperand()); ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. 
bool NoReplaceTrunc = N0.hasOneUse(); @@ -10626,8 +11126,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); recursivelyDeleteUnusedNodes(LN0); } else { - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), - N0.getValueType(), ExtLoad); + SDValue Trunc = + DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(LN0, Trunc, ExtLoad.getValue(1)); } return SDValue(N, 0); // Return N so it doesn't get rechecked! @@ -10832,12 +11332,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { return SDValue(); uint64_t ShiftAmt = N01->getZExtValue(); - uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits(); + uint64_t MemoryWidth = LN0->getMemoryVT().getScalarSizeInBits(); if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); else ExtVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getSizeInBits() - ShiftAmt); + VT.getScalarSizeInBits() - ShiftAmt); } else if (Opc == ISD::AND) { // An AND with a constant mask is the same as a truncate + zero-extend. auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1)); @@ -10864,12 +11364,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue SRL = N0; if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) { ShAmt = ConstShift->getZExtValue(); - unsigned EVTBits = ExtVT.getSizeInBits(); + unsigned EVTBits = ExtVT.getScalarSizeInBits(); // Is the shift amount a multiple of size of VT? if ((ShAmt & (EVTBits-1)) == 0) { N0 = N0.getOperand(0); // Is the load width a multiple of size of VT? - if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0) + if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0) return SDValue(); } @@ -10899,7 +11399,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), ShiftMask.countTrailingOnes()); // If the mask is smaller, recompute the type. - if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) && + if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT)) ExtVT = MaskedVT; } @@ -10930,8 +11430,9 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { return SDValue(); auto AdjustBigEndianShift = [&](unsigned ShAmt) { - unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits(); - unsigned EVTStoreBits = ExtVT.getStoreSizeInBits(); + unsigned LVTStoreBits = + LN0->getMemoryVT().getStoreSizeInBits().getFixedSize(); + unsigned EVTStoreBits = ExtVT.getStoreSizeInBits().getFixedSize(); return LVTStoreBits - EVTStoreBits - ShAmt; }; @@ -10941,13 +11442,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ShAmt = AdjustBigEndianShift(ShAmt); uint64_t PtrOff = ShAmt / 8; - unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); + Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. 
SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); - SDValue NewPtr = - DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags); + SDValue NewPtr = DAG.getMemBasePlusOffset(LN0->getBasePtr(), + TypeSize::Fixed(PtrOff), DL, Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -10969,13 +11470,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { SDValue Result = Load; if (ShLeftAmt != 0) { EVT ShImmTy = getShiftAmountTy(Result.getValueType()); - if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt)) + if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt)) ShImmTy = VT; // If the shift amount is as large as the result size (but, presumably, // no larger than the source) then the useful bits of the result are // zero; we can't simply return the shortened shift, because the result // of that operation is undefined. - if (ShLeftAmt >= VT.getSizeInBits()) + if (ShLeftAmt >= VT.getScalarSizeInBits()) Result = DAG.getConstant(0, DL, VT); else Result = DAG.getNode(ISD::SHL, DL, VT, @@ -11125,6 +11626,41 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } + // fold (sext_inreg (masked_load x)) -> (sext_masked_load x) + // ignore it if the masked load is already sign extended + if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) { + if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() && + Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { + SDValue ExtMaskedLoad = DAG.getMaskedLoad( + VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), + Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), + Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad()); + CombineTo(N, ExtMaskedLoad); + CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + + // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) + if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) { + if (SDValue(GN0, 0).hasOneUse() && + ExtVT == GN0->getMemoryVT() && + TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), + GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; + + SDValue ExtLoad = DAG.getMaskedGather( + DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops, + GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD); + + CombineTo(N, ExtLoad); + CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); + AddToWorklist(ExtLoad.getNode()); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), @@ -11225,10 +11761,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { EVT ExTy = N0.getValueType(); EVT TrTy = N->getValueType(0); - unsigned NumElem = VecTy.getVectorNumElements(); + auto EltCnt = VecTy.getVectorElementCount(); unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); + auto NewEltCnt = EltCnt * SizeRatio; - EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt); assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); SDValue EltNo = N0->getOperand(1); @@ -11342,8 +11879,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // after truncation. 
if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); - if (LN0->isSimple() && - LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) { + if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) { SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), @@ -11372,9 +11908,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Stop if more than one members are non-undef. if (NumDefs > 1) break; + VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - X.getValueType().getVectorNumElements())); + X.getValueType().getVectorElementCount())); } if (NumDefs == 0) @@ -11415,8 +11952,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } // Simplify the operands using demanded-bits information. - if (!VT.isVector() && - SimplifyDemandedBits(SDValue(N, 0))) + if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) @@ -11643,7 +12179,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { *LN0->getMemOperand())) { SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getPointerInfo(), LN0->getAlign(), LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; @@ -11990,7 +12526,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (!HasFMAD && !HasFMA) return SDValue(); - SDNodeFlags Flags = N->getFlags(); bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool CanReassociate = Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); @@ -12023,15 +12558,15 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), N1, Flags); + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), + N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N1.getOperand(0), N1.getOperand(1), N0, Flags); + return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), + N1.getOperand(1), N0); } // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) @@ -12054,8 +12589,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue B = FMA.getOperand(1); SDValue C = FMA.getOperand(2).getOperand(0); SDValue D = FMA.getOperand(2).getOperand(1); - SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags); - return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags); + SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E); + return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE); } // Look through FP_EXTEND nodes to do more combining. 
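
// Standalone sketch (not part of this commit) of why the "(fadd (fmul x, y), z) ->
// (fma x, y, z)" contraction above is gated on contraction being allowed: the fused
// form rounds only once, so it can differ from the separately rounded multiply+add.
// The inputs below are chosen to make the difference visible.
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0 + 0x1p-30, y = 1.0 - 0x1p-30, z = -1.0;
  double unfused = x * y + z;        // two roundings: product rounds to 1.0, sum is 0.0
  double fused = std::fma(x, y, z);  // one rounding: exact 1 - 2^-60 - 1 = -2^-60
  std::printf("unfused = %a\nfused   = %a\n", unfused, fused);
  assert(unfused == 0.0 && fused != 0.0);
  return 0;
}
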
@@ -12067,10 +12602,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), N1, Flags); + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), + N1); } } @@ -12082,10 +12616,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), N0, Flags); + DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), + N0); } } @@ -12093,14 +12626,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (Aggressive) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) - auto FoldFAddFMAFPExtFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, - SDNodeFlags Flags) { + auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, + SDValue Z) { return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z, Flags), Flags); + Z)); }; if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); @@ -12111,7 +12643,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), - N1, Flags); + N1); } } } @@ -12121,16 +12653,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
- auto FoldFAddFPExtFMAFMul = [&] ( - SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, - SDNodeFlags Flags) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, X), - DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z, Flags), Flags); + auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, + SDValue Z) { + return DAG.getNode( + PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), + DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, U), + DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -12141,7 +12671,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), - N1, Flags); + N1); } } } @@ -12157,7 +12687,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), - N0, Flags); + N0); } } } @@ -12176,7 +12706,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), - N0, Flags); + N0); } } } @@ -12234,8 +12764,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), - XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z), - Flags); + XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z)); } return SDValue(); }; @@ -12246,7 +12775,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), - YZ.getOperand(1), X, Flags); + YZ.getOperand(1), X); } return SDValue(); }; @@ -12277,7 +12806,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N00), N01, - DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); + DAG.getNode(ISD::FNEG, SL, VT, N1)); } // Look through FP_EXTEND nodes to do more combining. 
@@ -12290,11 +12819,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1)); } } @@ -12306,13 +12833,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(N10) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N10.getOperand(1)), - N0, Flags); + return DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); } } @@ -12329,13 +12854,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(N000) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(1)), - N1, Flags)); + return DAG.getNode( + ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), + N1)); } } } @@ -12353,13 +12877,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(N000) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N000.getValueType())) { - return DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N000.getOperand(1)), - N1, Flags)); + return DAG.getNode( + ISD::FNEG, SL, VT, + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), + N1)); } } } @@ -12371,13 +12894,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), + return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), + N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), - DAG.getNode(ISD::FNEG, SL, VT, - N1), Flags), Flags); + DAG.getNode(ISD::FNEG, SL, VT, N1))); } // fold (fsub x, (fma y, z, (fmul u, v))) @@ -12387,13 +12909,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N1->hasOneUse() && NoSignedZero) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - N1.getOperand(0)), - N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N20), - N21, N0, Flags), Flags); + return DAG.getNode( + 
PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); } @@ -12407,15 +12927,13 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(N020) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N020.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(0), N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N020.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1), Flags), Flags); + return DAG.getNode( + PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), + DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1))); } } } @@ -12433,18 +12951,15 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (isContractableFMUL(N002) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N00.getOperand(1)), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N002.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, - N1), Flags), Flags); + return DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), + DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), + DAG.getNode(ISD::FNEG, SL, VT, N1))); } } } @@ -12460,16 +12975,13 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), - N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1200)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1201), - N0, Flags), Flags); + return DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), + DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0)); } } @@ -12490,18 +13002,15 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N100)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, - VT, N1020)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, - N1021), - N0, Flags), Flags); + return DAG.getNode( + PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), + DAG.getNode(PreferredFusedOpcode, SL, VT, + 
DAG.getNode(ISD::FNEG, SL, VT, + DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)), + DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0)); } } } @@ -12517,7 +13026,6 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); - const SDNodeFlags Flags = N->getFlags(); assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); @@ -12549,56 +13057,56 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y) // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y)) - auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { + auto FuseFADD = [&](SDValue X, SDValue Y) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) { if (C->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - Y, Flags); + Y); if (C->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + DAG.getNode(ISD::FNEG, SL, VT, Y)); } } return SDValue(); }; - if (SDValue FMA = FuseFADD(N0, N1, Flags)) + if (SDValue FMA = FuseFADD(N0, N1)) return FMA; - if (SDValue FMA = FuseFADD(N1, N0, Flags)) + if (SDValue FMA = FuseFADD(N1, N0)) return FMA; // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y) // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y)) // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y)) // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y) - auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { + auto FuseFSUB = [&](SDValue X, SDValue Y) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) { if (C0->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - Y, Flags); + Y); if (C0->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + DAG.getNode(ISD::FNEG, SL, VT, Y)); } if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) { if (C1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + DAG.getNode(ISD::FNEG, SL, VT, Y)); if (C1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - Y, Flags); + Y); } } return SDValue(); }; - if (SDValue FMA = FuseFSUB(N0, N1, Flags)) + if (SDValue FMA = FuseFSUB(N0, N1)) return FMA; - if (SDValue FMA = FuseFSUB(N1, N0, Flags)) + if (SDValue FMA = FuseFSUB(N1, N0)) return FMA; return SDValue(); @@ -12607,12 +13115,13 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); - bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); + bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags Flags = N->getFlags(); + SDNodeFlags Flags = N->getFlags(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) 
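
// Standalone sketch (not part of this commit) of the algebra behind the FuseFADD /
// FuseFSUB lambdas in visitFMULForFMADistributiveCombine above: (x0 +/- 1)*y distributes
// to x0*y +/- y, which the combine re-expresses as an FMA. The values are chosen so both
// sides are exact; in the compiler this is only done when fusing is legal/profitable.
#include <cassert>
#include <cmath>

int main() {
  double x0 = 3.5, y = -2.25;
  assert((x0 + 1.0) * y == std::fma(x0, y, y));   // (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
  assert((x0 - 1.0) * y == std::fma(x0, y, -y));  // (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
  assert((1.0 - x0) * y == std::fma(-x0, y, y));  // (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
  return 0;
}
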
return R; @@ -12624,11 +13133,11 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags); + return DAG.getNode(ISD::FADD, DL, VT, N0, N1); // canonicalize constant to RHS if (N0CFP && !N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); + return DAG.getNode(ISD::FADD, DL, VT, N1, N0); // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); @@ -12643,13 +13152,13 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) if (SDValue NegN1 = TLI.getCheaperNegatedExpression( N1, DAG, LegalOperations, ForCodeSize)) - return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1); // fold (fadd (fneg A), B) -> (fsub B, A) if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) if (SDValue NegN0 = TLI.getCheaperNegatedExpression( N0, DAG, LegalOperations, ForCodeSize)) - return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0); auto isFMulNegTwo = [](SDValue FMul) { if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) @@ -12661,14 +13170,14 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B) if (isFMulNegTwo(N0)) { SDValue B = N0.getOperand(0); - SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags); - return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B); + return DAG.getNode(ISD::FSUB, DL, VT, N1, Add); } // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B) if (isFMulNegTwo(N1)) { SDValue B = N1.getOperand(0); - SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags); - return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B); + return DAG.getNode(ISD::FSUB, DL, VT, N0, Add); } // No FP constant should be created after legalization as Instruction @@ -12694,9 +13203,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { AllowNewConst) { // fadd (fadd x, c1), c2 -> fadd x, c1 + c2 if (N1CFP && N0.getOpcode() == ISD::FADD && - isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { - SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags); - return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags); + DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { + SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1); + return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC); } // We can fold chains of FADD's of the same value into multiplications. @@ -12704,14 +13213,14 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. 
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { - bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); - bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); + bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), - DAG.getConstantFP(1.0, DL, VT), Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags); + DAG.getConstantFP(1.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) @@ -12719,20 +13228,20 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), - DAG.getConstantFP(2.0, DL, VT), Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags); + DAG.getConstantFP(2.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP); } } if (N1.getOpcode() == ISD::FMUL) { - bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); - bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); + bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), - DAG.getConstantFP(1.0, DL, VT), Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags); + DAG.getConstantFP(1.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) @@ -12740,28 +13249,28 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), - DAG.getConstantFP(2.0, DL, VT), Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags); + DAG.getConstantFP(2.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP); } } if (N0.getOpcode() == ISD::FADD) { - bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { - return DAG.getNode(ISD::FMUL, DL, VT, - N1, DAG.getConstantFP(3.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N1, + DAG.getConstantFP(3.0, DL, VT)); } } if (N1.getOpcode() == ISD::FADD) { - bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { - return DAG.getNode(ISD::FMUL, DL, VT, - N0, DAG.getConstantFP(3.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, + DAG.getConstantFP(3.0, DL, VT)); } } @@ -12771,7 +13280,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), - 
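
// Standalone sketch (not part of this commit) of the reassociation folds above, which
// turn chains of additions of the same value into a single multiply. These require
// unsafe-math/reassoc in the combiner; the values here are chosen so every intermediate
// result is exact.
#include <cassert>

int main() {
  double x = 1.5, c = 4.0;
  assert(x * c + x == x * (c + 1.0));    // (fadd (fmul x, c), x)           -> (fmul x, c+1)
  assert((x + x) + x == x * 3.0);        // (fadd (fadd x, x), x)           -> (fmul x, 3.0)
  assert((x + x) + (x + x) == x * 4.0);  // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
  return 0;
}
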
DAG.getConstantFP(4.0, DL, VT), Flags); + DAG.getConstantFP(4.0, DL, VT)); } } } // enable-unsafe-fp-math @@ -12784,6 +13293,33 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue N0 = N->getOperand(1); + SDValue N1 = N->getOperand(2); + EVT VT = N->getValueType(0); + EVT ChainVT = N->getValueType(1); + SDLoc DL(N); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); + + // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B) + if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) + if (SDValue NegN1 = TLI.getCheaperNegatedExpression( + N1, DAG, LegalOperations, ForCodeSize)) { + return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), + {Chain, N0, NegN1}); + } + + // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A) + if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT)) + if (SDValue NegN0 = TLI.getCheaperNegatedExpression( + N0, DAG, LegalOperations, ForCodeSize)) { + return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT), + {Chain, N1, NegN0}); + } + return SDValue(); +} + SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12793,6 +13329,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; @@ -12804,7 +13341,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N0, N1); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -12824,18 +13361,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } // (fsub -0.0, N1) -> -N1 - // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the - // FSUB does not specify the sign bit of a NaN. Also note that for - // the same reason, the inverse transform is not safe, unless fast math - // flags are in play. if (N0CFP && N0CFP->isZero()) { if (N0CFP->isNegative() || (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { - if (SDValue NegN1 = - TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) - return NegN1; - if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) - return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags); + // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are + // flushed to zero, unless all users treat denorms as zero (DAZ). + // FIXME: This transform will change the sign of a NaN and the behavior + // of a signaling NaN. It is only valid when a NoNaN flag is present. 
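
// Standalone sketch (not part of this commit) of the identity behind the new
// visitSTRICT_FADD folds above: a + (-b) and a - b are the same IEEE operation, so the
// rewrite is exact and remains valid under strict FP semantics. The DAG node also
// threads the chain operand, which has no analogue here.
#include <cassert>

int main() {
  const double vals[] = {0.0, -0.0, 1.5, -2.25, 1e308, 5e-324};
  for (double a : vals)
    for (double b : vals)
      assert(a + (-b) == a - b);
  return 0;
}
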
+ DenormalMode DenormMode = DAG.getDenormalMode(VT); + if (DenormMode == DenormalMode::getIEEE()) { + if (SDValue NegN1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) + return NegN1; + if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) + return DAG.getNode(ISD::FNEG, DL, VT, N1); + } } } @@ -12844,16 +13384,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { N1.getOpcode() == ISD::FADD) { // X - (X + Y) -> -Y if (N0 == N1->getOperand(0)) - return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags); + return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1)); // X - (Y + X) -> -Y if (N0 == N1->getOperand(1)) - return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0), Flags); + return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0)); } // fold (fsub A, (fneg B)) -> (fadd A, B) if (SDValue NegN1 = TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize)) - return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags); + return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1); // FSUB -> FMA combines: if (SDValue Fused = visitFSUBForFMACombine(N)) { @@ -12873,6 +13413,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; @@ -12886,35 +13427,28 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); // canonicalize constant to RHS - if (isConstantFPBuildVectorOrConstantFP(N0) && - !isConstantFPBuildVectorOrConstantFP(N1)) - return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); + if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && + !DAG.isConstantFPBuildVectorOrConstantFP(N1)) + return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; - if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) || - (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) { - // fold (fmul A, 0) -> 0 - if (N1CFP && N1CFP->isZero()) - return N1; - } - if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) { // fmul (fmul X, C1), C2 -> fmul X, C1 * C2 - if (isConstantFPBuildVectorOrConstantFP(N1) && + if (DAG.isConstantFPBuildVectorOrConstantFP(N1) && N0.getOpcode() == ISD::FMUL) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); // Avoid an infinite loop by making sure that N00 is not a constant // (the inner multiply has not been constant folded yet). 
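
// Standalone sketch (not part of this commit) of the corner cases the guarded
// "(fsub -0.0/+0.0, X) -> (fneg X)" fold above has to respect. With IEEE denormal
// handling, (-0.0) - x is a sign flip for every x, while the +0.0 variant already needs
// no-signed-zeros because of the x == +0.0 case shown last. The new DenormalMode check
// covers targets that flush denormals to zero, which portable C++ cannot reproduce here.
#include <cassert>
#include <cmath>

int main() {
  const double vals[] = {0.0, -0.0, 1.5, -2.25, 5e-324};
  for (double x : vals)
    assert((-0.0 - x) == -x && std::signbit(-0.0 - x) == std::signbit(-x));
  // +0.0 - x is not a sign flip when x is +0.0:
  assert(!std::signbit(0.0 - 0.0) && std::signbit(-(0.0)));
  return 0;
}
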
- if (isConstantFPBuildVectorOrConstantFP(N01) && - !isConstantFPBuildVectorOrConstantFP(N00)) { - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags); + if (DAG.isConstantFPBuildVectorOrConstantFP(N01) && + !DAG.isConstantFPBuildVectorOrConstantFP(N00)) { + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1); + return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts); } } @@ -12923,14 +13457,14 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() && N0.getOperand(0) == N0.getOperand(1)) { const SDValue Two = DAG.getConstantFP(2.0, DL, VT); - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags); + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) - return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); + return DAG.getNode(ISD::FADD, DL, VT, N0, N0); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) @@ -12949,7 +13483,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (NegN0 && NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper || CostN1 == TargetLowering::NegatibleCost::Cheaper)) - return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1); // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) @@ -13015,10 +13549,11 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; - // FMA nodes have flags that propagate to the created nodes. - const SDNodeFlags Flags = N->getFlags(); - bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); + + bool UnsafeFPMath = + Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); // Constant fold FMA. if (isa<ConstantFPSDNode>(N0) && @@ -13039,7 +13574,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (NegN0 && NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper || CostN1 == TargetLowering::NegatibleCost::Cheaper)) - return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags); + return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) @@ -13047,51 +13582,45 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP && N1CFP->isZero()) return N2; } - // TODO: The FMA node should have flags that propagate to these nodes. 
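
// Standalone sketch (not part of this commit) of the FMUL identities above. x * 2.0 ==
// x + x and x * -1.0 == -x are exact for every finite x, so those folds need no FP
// flags; merging the constants in (x * c1) * c2 reassociates and is therefore guarded by
// UnsafeFPMath/AllowReassociation in the combiner (the power-of-two constants below keep
// it exact).
#include <cassert>

int main() {
  const double vals[] = {0.0, -0.0, 1.5, -2.25, 1e-300, -1e300};
  for (double x : vals) {
    assert(x * 2.0 == x + x);  // (fmul X, 2.0)  -> (fadd X, X)
    assert(x * -1.0 == -x);    // (fmul X, -1.0) -> (fneg X)
  }
  double c1 = 0.5, c2 = 8.0;
  for (double x : vals)
    assert((x * c1) * c2 == x * (c1 * c2));  // fmul (fmul X, C1), C2 -> fmul X, C1*C2
  return 0;
}
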
+ if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) - if (isConstantFPBuildVectorOrConstantFP(N0) && - !isConstantFPBuildVectorOrConstantFP(N1)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && + !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && - isConstantFPBuildVectorOrConstantFP(N1) && - isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { + DAG.isConstantFPBuildVectorOrConstantFP(N1) && + DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1), - Flags), Flags); + DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1))); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) if (N0.getOpcode() == ISD::FMUL && - isConstantFPBuildVectorOrConstantFP(N1) && - isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { - return DAG.getNode(ISD::FMA, DL, VT, - N0.getOperand(0), - DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1), - Flags), + DAG.isConstantFPBuildVectorOrConstantFP(N1) && + DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { + return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), + DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)), N2); } } - // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) - // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); AddToWorklist(RHSNeg.getNode()); - // TODO: The FMA node should have flags that propagate to this node. 
return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } @@ -13101,25 +13630,23 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT, ForCodeSize)))) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2); + DAG.getNode(ISD::FNEG, DL, VT, N1), N2); } } if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { - return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(1.0, DL, VT), Flags), - Flags); + return DAG.getNode( + ISD::FMUL, DL, VT, N0, + DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT))); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { - return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, - DAG.getConstantFP(-1.0, DL, VT), Flags), - Flags); + return DAG.getNode( + ISD::FMUL, DL, VT, N0, + DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT))); } } @@ -13128,7 +13655,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (!TLI.isFNegFree(VT)) if (SDValue Neg = TLI.getCheaperNegatedExpression( SDValue(N, 0), DAG, LegalOperations, ForCodeSize)) - return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags); + return DAG.getNode(ISD::FNEG, DL, VT, Neg); return SDValue(); } @@ -13149,14 +13676,13 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { return SDValue(); // Skip if current node is a reciprocal/fneg-reciprocal. - SDValue N0 = N->getOperand(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true); if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0))) return SDValue(); // Exit early if the target does not want this transform or if there can't // possibly be enough uses of the divisor to make the transform worthwhile. - SDValue N1 = N->getOperand(1); unsigned MinUses = TLI.combineRepeatedFPDivisors(); // For splat vectors, scale the number of uses by the splat factor. If we can @@ -13174,6 +13700,13 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { SetVector<SDNode *> Users; for (auto *U : N1->uses()) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { + // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet. + if (U->getOperand(1).getOpcode() == ISD::FSQRT && + U->getOperand(0) == U->getOperand(1).getOperand(0) && + U->getFlags().hasAllowReassociation() && + U->getFlags().hasNoSignedZeros()) + continue; + // This division is eligible for optimization only if global unsafe math // is enabled or if this division allows reciprocal formation. 
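
// Standalone sketch (not part of this commit) of the FMA special cases above.
// Multiplying by exactly 1.0 or -1.0 introduces no rounding, so those folds are
// unconditional; the (fma x, c, x) -> (fmul x, c+1) forms reassociate and are only done
// under UnsafeFPMath in the combiner (the constants below keep them exact).
#include <cassert>
#include <cmath>

int main() {
  double x = 3.25, y = -1.125, c = 4.0;
  assert(std::fma(x, 1.0, y) == x + y);         // (fma x, 1, y)        -> (fadd x, y)
  assert(std::fma(x, -1.0, y) == (-x) + y);     // (fma x, -1, y)       -> (fadd (fneg x), y)
  assert(std::fma(x, c, x) == x * (c + 1.0));   // (fma x, c, x)        -> (fmul x, c+1)
  assert(std::fma(x, c, -x) == x * (c - 1.0));  // (fma x, c, (fneg x)) -> (fmul x, c-1)
  return 0;
}
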
if (UnsafeMath || U->getFlags().hasAllowReciprocal()) @@ -13215,6 +13748,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = N->getFlags(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; @@ -13226,7 +13760,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); + return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -13251,29 +13785,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT, ForCodeSize))) return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getConstantFP(Recip, DL, VT), Flags); + DAG.getConstantFP(Recip, DL, VT)); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), - Flags)) { + if (SDValue RV = + buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), - Flags)) { + if (SDValue RV = + buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, @@ -13288,29 +13822,34 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } if (Sqrt.getNode()) { // If the other multiply operand is known positive, pull it into the - // sqrt. That will eliminate the division if we convert to an estimate: - // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z) - // TODO: Also fold the case where A == Z (fabs is missing). + // sqrt. That will eliminate the division if we convert to an estimate. if (Flags.hasAllowReassociation() && N1.hasOneUse() && - N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() && - Y.getOpcode() == ISD::FABS && Y.hasOneUse()) { - SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0), - Y.getOperand(0), Flags); - SDValue AAZ = - DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags); - if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags)) - return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags); - - // Estimate creation failed. Clean up speculatively created nodes. 
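
// Standalone sketch (not part of this commit) of the "fdiv X, c -> fmul X, 1/c" rewrite
// above. The combiner only does this when the reciprocal is exact (or fast-math allows
// an inexact one); for a power-of-two divisor the reciprocal is exact, so the multiply
// is bit-identical. combineRepeatedFPDivisors applies the same idea to a non-constant
// divisor that is used by many divisions: compute 1/d once, then multiply.
#include <cassert>

int main() {
  const double vals[] = {0.0, -1.5, 3.141592653589793, 1e100, 5e-324};
  double c = 8.0, recip = 1.0 / c;  // exact: 0.125
  for (double x : vals)
    assert(x / c == x * recip);
  return 0;
}
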
- recursivelyDeleteUnusedNodes(AAZ.getNode()); + N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse()) { + SDValue A; + if (Y.getOpcode() == ISD::FABS && Y.hasOneUse()) + A = Y.getOperand(0); + else if (Y == Sqrt.getOperand(0)) + A = Y; + if (A) { + // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z) + // X / (A * sqrt(A)) --> X / sqrt(A*A*A) --> X * rsqrt(A*A*A) + SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A); + SDValue AAZ = + DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0)); + if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags)) + return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt); + + // Estimate creation failed. Clean up speculatively created nodes. + recursivelyDeleteUnusedNodes(AAZ.getNode()); + } } // We found a FSQRT, so try to make this fold: // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y) if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) { - SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags); + SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y); AddToWorklist(Div.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, Div); } } } @@ -13321,6 +13860,12 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { return RV; } + // Fold X/Sqrt(X) -> Sqrt(X) + if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && + (Options.UnsafeFPMath || Flags.hasAllowReassociation())) + if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) + return N1; + // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) TargetLowering::NegatibleCost CostN0 = TargetLowering::NegatibleCost::Expensive; @@ -13333,7 +13878,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (NegN0 && NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper || CostN1 == TargetLowering::NegatibleCost::Cheaper)) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags); + return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1); return SDValue(); } @@ -13345,13 +13890,14 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDNodeFlags Flags = N->getFlags(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); + return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -13365,7 +13911,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as: // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN - if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) || + if (!Flags.hasApproximateFuncs() || (!Options.NoInfsFPMath && !Flags.hasNoInfs())) return SDValue(); @@ -13374,6 +13920,10 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { return SDValue(); // FSQRT nodes have flags that propagate to the created nodes. + // TODO: If this is N0/sqrt(N0), and we reach this node before trying to + // transform the fdiv, we may produce a sub-optimal estimate sequence + // because the reciprocal calculation may not have to filter out a + // 0.0 input. 
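
// Standalone sketch (not part of this commit) of the algebra behind the extended rsqrt
// fold and the new X/sqrt(X) fold above: |a|*sqrt(z) == sqrt(a*a*z), and x/sqrt(x) ==
// sqrt(x) for x > 0. In the combiner these are estimate/reassociation transforms (gated
// on reassoc and nsz flags), so equality is only approximate in general; the values
// below keep it exact.
#include <cassert>
#include <cmath>

int main() {
  double a = -3.0, z = 4.0, x = 16.0;
  assert(std::fabs(a) * std::sqrt(z) == std::sqrt(a * a * z));  // divisor rewritten by the rsqrt fold
  assert(x / std::sqrt(x) == std::sqrt(x));                     // X / sqrt(X) -> sqrt(X)
  return 0;
}
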
return buildSqrtEstimate(N0, Flags); } @@ -13397,8 +13947,8 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); - bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); + bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); if (N0CFP && N1CFP) // Constant fold @@ -13445,6 +13995,7 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1)); if (!ExponentC) return SDValue(); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Try to convert x ** (1/3) into cube root. // TODO: Handle the various flavors of long double. @@ -13471,7 +14022,7 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT))) return SDValue(); - return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags); + return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0)); } // Try to convert x ** (1/4) and x ** (3/4) into square roots. @@ -13506,12 +14057,12 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { // pow(X, 0.25) --> sqrt(sqrt(X)) SDLoc DL(N); - SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags); - SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags); + SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0)); + SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt); if (ExponentIs025) return SqrtSqrt; // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X)) - return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt); } return SDValue(); @@ -13694,7 +14245,7 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { return DAG.getUNDEF(VT); // fold (fp_to_sint c1fp) -> c1 - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); @@ -13709,7 +14260,7 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { return DAG.getUNDEF(VT); // fold (fp_to_uint c1fp) -> c1 - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); @@ -13781,7 +14332,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { return SDValue(); // fold (fp_extend c1fp) -> c1fp - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0); // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op) @@ -13829,7 +14380,7 @@ SDValue DAGCombiner::visitFCEIL(SDNode *N) { EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); return SDValue(); @@ -13840,7 +14391,7 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); // fold ftrunc (known rounded int x) -> x @@ -13864,19 +14415,19 @@ SDValue DAGCombiner::visitFFLOOR(SDNode *N) { EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) - if 
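
// Standalone sketch (not part of this commit) of the FPOW folds above: pow(X, 1/3) ->
// cbrt(X), pow(X, 0.25) -> sqrt(sqrt(X)), pow(X, 0.75) -> sqrt(X)*sqrt(sqrt(X)). pow is
// not required to be correctly rounded, which is one reason these rewrites sit behind
// fast-math/approximation checks; the printout below simply shows the pairs that the
// fold treats as equivalent.
#include <cmath>
#include <cstdio>

int main() {
  double x = 256.0;
  std::printf("pow(x, 0.25) = %.17g   sqrt(sqrt(x))          = %.17g\n",
              std::pow(x, 0.25), std::sqrt(std::sqrt(x)));
  std::printf("pow(x, 0.75) = %.17g   sqrt(x)*sqrt(sqrt(x))  = %.17g\n",
              std::pow(x, 0.75), std::sqrt(x) * std::sqrt(std::sqrt(x)));
  std::printf("pow(27, 1/3) = %.17g   cbrt(27)               = %.17g\n",
              std::pow(27.0, 1.0 / 3.0), std::cbrt(27.0));
  return 0;
}
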
(isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); return SDValue(); } -// FIXME: FNEG and FABS have a lot in common; refactor. SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold FNEG. - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); if (SDValue NegN0 = @@ -13891,51 +14442,12 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { (DAG.getTarget().Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) { return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1), - N0.getOperand(0), N->getFlags()); - } - - // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading - // constant pool values. - if (!TLI.isFNegFree(VT) && - N0.getOpcode() == ISD::BITCAST && - N0.getNode()->hasOneUse()) { - SDValue Int = N0.getOperand(0); - EVT IntVT = Int.getValueType(); - if (IntVT.isInteger() && !IntVT.isVector()) { - APInt SignMask; - if (N0.getValueType().isVector()) { - // For a vector, get a mask such as 0x80... per scalar element - // and splat it. - SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); - SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); - } else { - // For a scalar, just generate 0x80... - SignMask = APInt::getSignMask(IntVT.getSizeInBits()); - } - SDLoc DL0(N0); - Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, - DAG.getConstant(SignMask, DL0, IntVT)); - AddToWorklist(Int.getNode()); - return DAG.getBitcast(VT, Int); - } - } - - // (fneg (fmul c, x)) -> (fmul -c, x) - if (N0.getOpcode() == ISD::FMUL && - (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) { - ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); - if (CFP1) { - APFloat CVal = CFP1->getValueAPF(); - CVal.changeSign(); - if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) || - TLI.isOperationLegal(ISD::ConstantFP, VT))) - return DAG.getNode( - ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), - N0->getFlags()); - } + N0.getOperand(0)); } + if (SDValue Cast = foldSignChangeInBitcast(N)) + return Cast; + return SDValue(); } @@ -13946,6 +14458,11 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); + const SDNodeFlags Flags = N->getFlags(); + unsigned Opc = N->getOpcode(); + bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; + bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; + SelectionDAG::FlagInserter FlagsInserter(DAG, N); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); @@ -13954,10 +14471,39 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, } // Canonicalize to constant on RHS. - if (isConstantFPBuildVectorOrConstantFP(N0) && - !isConstantFPBuildVectorOrConstantFP(N1)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && + !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); + if (N1CFP) { + const APFloat &AF = N1CFP->getValueAPF(); + + // minnum(X, nan) -> X + // maxnum(X, nan) -> X + // minimum(X, nan) -> nan + // maximum(X, nan) -> nan + if (AF.isNaN()) + return PropagatesNaN ? 
N->getOperand(1) : N->getOperand(0); + + // In the following folds, inf can be replaced with the largest finite + // float, if the ninf flag is set. + if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) { + // minnum(X, -inf) -> -inf + // maxnum(X, +inf) -> +inf + // minimum(X, -inf) -> -inf if nnan + // maximum(X, +inf) -> +inf if nnan + if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs())) + return N->getOperand(1); + + // minnum(X, +inf) -> X if nnan + // maxnum(X, -inf) -> X if nnan + // minimum(X, +inf) -> X + // maximum(X, -inf) -> X + if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs())) + return N->getOperand(0); + } + } + return SDValue(); } @@ -13982,7 +14528,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { EVT VT = N->getValueType(0); // fold (fabs c1) -> fabs(c1) - if (isConstantFPBuildVectorOrConstantFP(N0)) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // fold (fabs (fabs x)) -> (fabs x) @@ -13994,28 +14540,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); - // fabs(bitcast(x)) -> bitcast(x & ~sign) to avoid constant pool loads. - if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) { - SDValue Int = N0.getOperand(0); - EVT IntVT = Int.getValueType(); - if (IntVT.isInteger() && !IntVT.isVector()) { - APInt SignMask; - if (N0.getValueType().isVector()) { - // For a vector, get a mask such as 0x7f... per scalar element - // and splat it. - SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits()); - SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); - } else { - // For a scalar, just generate 0x7f... - SignMask = ~APInt::getSignMask(IntVT.getSizeInBits()); - } - SDLoc DL(N0); - Int = DAG.getNode(ISD::AND, DL, IntVT, Int, - DAG.getConstant(SignMask, DL, IntVT)); - AddToWorklist(Int.getNode()); - return DAG.getBitcast(N->getValueType(0), Int); - } - } + if (SDValue Cast = foldSignChangeInBitcast(N)) + return Cast; return SDValue(); } @@ -14025,6 +14551,13 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); + // BRCOND(FREEZE(cond)) is equivalent to BRCOND(cond) (both are + // nondeterministic jumps). + if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse()) { + return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, + N1->getOperand(0), N2); + } + // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. @@ -14178,63 +14711,6 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) { return SDValue(); } -/// Return true if 'Use' is a load or a store that uses N as its base pointer -/// and that N may be folded in the load / store addressing mode. 
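The folds above hinge on the NaN rules of the two min/max families. Below is a small standalone C++ sketch (not from this patch) of those semantics: std::fmin/std::fmax follow IEEE-754 minNum/maxNum and ignore a quiet-NaN operand, while the hand-written variant propagates NaN the way ISD::FMINIMUM/FMAXIMUM do.

// Standalone illustration, not part of the patch: the NaN behaviour behind
// the minnum/maxnum vs minimum/maximum folds above.
#include <cassert>
#include <cmath>
#include <limits>

// NaN-propagating minimum, standing in for ISD::FMINIMUM.
static double propagating_minimum(double a, double b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<double>::quiet_NaN();
  return a < b ? a : b;
}

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  const double x = 42.0;

  // minnum(X, nan) -> X and maxnum(X, nan) -> X: return the non-NaN operand.
  assert(std::fmin(x, nan) == x);
  assert(std::fmax(x, nan) == x);

  // minimum(X, nan) -> nan: the NaN operand must be returned instead.
  assert(std::isnan(propagating_minimum(x, nan)));
  return 0;
}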
-static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, - SelectionDAG &DAG, - const TargetLowering &TLI) { - EVT VT; - unsigned AS; - - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { - if (LD->isIndexed() || LD->getBasePtr().getNode() != N) - return false; - VT = LD->getMemoryVT(); - AS = LD->getAddressSpace(); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { - if (ST->isIndexed() || ST->getBasePtr().getNode() != N) - return false; - VT = ST->getMemoryVT(); - AS = ST->getAddressSpace(); - } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) { - if (LD->isIndexed() || LD->getBasePtr().getNode() != N) - return false; - VT = LD->getMemoryVT(); - AS = LD->getAddressSpace(); - } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) { - if (ST->isIndexed() || ST->getBasePtr().getNode() != N) - return false; - VT = ST->getMemoryVT(); - AS = ST->getAddressSpace(); - } else - return false; - - TargetLowering::AddrMode AM; - if (N->getOpcode() == ISD::ADD) { - AM.HasBaseReg = true; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (Offset) - // [reg +/- imm] - AM.BaseOffs = Offset->getSExtValue(); - else - // [reg +/- reg] - AM.Scale = 1; - } else if (N->getOpcode() == ISD::SUB) { - AM.HasBaseReg = true; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (Offset) - // [reg +/- imm] - AM.BaseOffs = -Offset->getSExtValue(); - else - // [reg +/- reg] - AM.Scale = 1; - } else - return false; - - return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, - VT.getTypeForEVT(*DAG.getContext()), AS); -} - static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec, bool &IsLoad, bool &IsMasked, SDValue &Ptr, const TargetLowering &TLI) { @@ -14463,16 +14939,13 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Therefore, we have: // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 - ConstantSDNode *CN = - cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx)); - int X0, X1, Y0, Y1; + auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx)); const APInt &Offset0 = CN->getAPIntValue(); - APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue(); - - X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; - Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; - X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; - Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; + const APInt &Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue(); + int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; + int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; + int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; + int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD; @@ -14664,8 +15137,8 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); } -static inline int numVectorEltsOrZero(EVT T) { - return T.isVector() ? T.getVectorNumElements() : 0; +static inline ElementCount numVectorEltsOrZero(EVT T) { + return T.isVector() ? 
T.getVectorElementCount() : ElementCount::getFixed(0); } bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) { @@ -14733,6 +15206,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { EVT STMemType = ST->getMemoryVT(); EVT STType = ST->getValue().getValueType(); + // There are two cases to consider here: + // 1. The store is fixed width and the load is scalable. In this case we + // don't know at compile time if the store completely envelops the load + // so we abandon the optimisation. + // 2. The store is scalable and the load is fixed width. We could + // potentially support a limited number of cases here, but there has been + // no cost-benefit analysis to prove it's worth it. + bool LdStScalable = LDMemType.isScalableVector(); + if (LdStScalable != STMemType.isScalableVector()) + return SDValue(); + + // If we are dealing with scalable vectors on a big endian platform the + // calculation of offsets below becomes trickier, since we do not know at + // compile time the absolute size of the vector. Until we've done more + // analysis on big-endian platforms it seems better to bail out for now. + if (LdStScalable && DAG.getDataLayout().isBigEndian()) + return SDValue(); + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); int64_t Offset; @@ -14744,13 +15235,21 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { // the stored value). With Offset=n (for n > 0) the loaded value starts at the // n:th least significant byte of the stored value. if (DAG.getDataLayout().isBigEndian()) - Offset = ((int64_t)STMemType.getStoreSizeInBits() - - (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset; + Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() - + (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) / + 8 - + Offset; // Check that the stored value cover all bits that are loaded. - bool STCoversLD = - (Offset >= 0) && - (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits()); + bool STCoversLD; + + TypeSize LdMemSize = LDMemType.getSizeInBits(); + TypeSize StMemSize = STMemType.getSizeInBits(); + if (LdStScalable) + STCoversLD = (Offset == 0) && LdMemSize == StMemSize; + else + STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <= + StMemSize.getFixedSize()); auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue { if (LD->isIndexed()) { @@ -14771,15 +15270,15 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { // Memory as copy space (potentially masked). if (Offset == 0 && LDType == STType && STMemType == LDMemType) { // Simple case: Direct non-truncating forwarding - if (LDType.getSizeInBits() == LDMemType.getSizeInBits()) + if (LDType.getSizeInBits() == LdMemSize) return ReplaceLd(LD, ST->getValue(), Chain); // Can we model the truncate and extension with an and mask? 
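The big-endian branch above converts a memory-order byte offset into an offset from the least significant byte of the stored value. The following standalone sketch (not from this patch, widths are hypothetical) works through that arithmetic.

// Standalone sketch, not part of the patch: the endian adjustment used when
// forwarding a narrow load out of a wider store. A positive pointer offset
// means "skip the first N bytes in memory", but the forwarding logic wants the
// offset measured from the stored value's least significant byte.
#include <cassert>
#include <cstdint>

static int64_t lsbOffset(int64_t storeBits, int64_t loadBits,
                         int64_t memOffset, bool bigEndian) {
  if (!bigEndian)
    return memOffset;                            // little endian: LSB is byte 0
  return (storeBits - loadBits) / 8 - memOffset; // mirrors the code above
}

int main() {
  // Hypothetical case: a 16-bit load taken 2 bytes into a 64-bit store.
  // Little endian: those bytes are 2..3 counted from the LSB.
  assert(lsbOffset(64, 16, 2, /*bigEndian=*/false) == 2);
  // Big endian: memory byte 2 sits 4 bytes away from the LSB end of the value.
  assert(lsbOffset(64, 16, 2, /*bigEndian=*/true) == 4);
  return 0;
}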
if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() && !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) { // Mask to size of LDMemType auto Mask = - DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(), - STMemType.getSizeInBits()), + DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(), + StMemSize.getFixedSize()), SDLoc(ST), STType); auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask); return ReplaceLd(LD, Val, Chain); @@ -15602,8 +16101,6 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, // Figure out the offset for the store and the alignment of the access. unsigned StOffset; - unsigned NewAlign = St->getAlignment(); - if (DAG.getDataLayout().isLittleEndian()) StOffset = ByteShift; else @@ -15612,8 +16109,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, SDValue Ptr = St->getBasePtr(); if (StOffset) { SDLoc DL(IVal); - Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL); - NewAlign = MinAlign(NewAlign, StOffset); + Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL); } // Truncate down to the new size. @@ -15622,7 +16118,8 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, - St->getPointerInfo().getWithOffset(StOffset), NewAlign); + St->getPointerInfo().getWithOffset(StOffset), + St->getOriginalAlign()); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and @@ -15726,7 +16223,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy)) return SDValue(); - SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD)); + SDValue NewPtr = + DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(PtrOff), SDLoc(LD)); SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, @@ -16034,9 +16532,9 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( // make sure we use trunc store if it's necessary to be legal. SDValue NewStore; if (!UseTrunc) { - NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - FirstInChain->getAlignment()); + NewStore = + DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), FirstInChain->getAlign()); } else { // Must be realized as a trunc store EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); @@ -16048,8 +16546,7 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, - FirstInChain->getAlignment(), - FirstInChain->getMemOperand()->getFlags()); + FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); } // Replace all merged stores with the new store. @@ -16064,23 +16561,19 @@ void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes, SDNode *&RootNode) { // This holds the base pointer, index, and the offset in bytes from the base - // pointer. + // pointer. We must have a base and an offset. Do not handle stores to undef + // base pointers. 
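The AND-mask trick above (built with APInt::getLowBitsSet) models a truncating store followed by a zero- or any-extending load of the same location; sign-extending loads are excluded because the mask would discard the replicated sign bits. A standalone sketch of the identity, not from this patch:

// Standalone sketch, not part of the patch: trunc-store + zext-load of the
// same value is equivalent to masking the stored value with the low bits of
// the memory width.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t value = 0xDEADBEEF;

  uint16_t mem = static_cast<uint16_t>(value); // truncating store keeps low bits
  uint32_t loaded = mem;                       // zero-extending load

  uint32_t mask = 0xFFFFu;                     // low 16 bits set
  assert(loaded == (value & mask));
  return 0;
}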
BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); - EVT MemVT = St->getMemoryVT(); - - SDValue Val = peekThroughBitcasts(St->getValue()); - // We must have a base and an offset. - if (!BasePtr.getBase().getNode()) - return; - - // Do not handle stores to undef base pointers. - if (BasePtr.getBase().isUndef()) + if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef()) return; + SDValue Val = peekThroughBitcasts(St->getValue()); StoreSource StoreSrc = getStoreSource(Val); assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); - BaseIndexOffset LBasePtr; + // Match on loadbaseptr if relevant. + EVT MemVT = St->getMemoryVT(); + BaseIndexOffset LBasePtr; EVT LoadVT; if (StoreSrc == StoreSource::Load) { auto *Ld = cast<LoadSDNode>(Val); @@ -16101,7 +16594,7 @@ void DAGCombiner::getStoreMergeCandidates( int64_t &Offset) -> bool { // The memory operands must not be volatile/indexed/atomic. // TODO: May be able to relax for unordered atomics (see D66309) - if (!Other->isSimple() || Other->isIndexed()) + if (!Other->isSimple() || Other->isIndexed()) return false; // Don't mix temporal stores with non-temporal stores. if (St->isNonTemporal() != Other->isNonTemporal()) @@ -16110,37 +16603,38 @@ void DAGCombiner::getStoreMergeCandidates( // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) : Other->getMemoryVT() != MemVT; - if (StoreSrc == StoreSource::Load) { + switch (StoreSrc) { + case StoreSource::Load: { if (NoTypeMatch) return false; - // The Load's Base Ptr must also match - if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) { - BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG); - if (LoadVT != OtherLd->getMemoryVT()) - return false; - // Loads must only have one use. - if (!OtherLd->hasNUsesOfValue(1, 0)) - return false; - // The memory operands must not be volatile/indexed/atomic. - // TODO: May be able to relax for unordered atomics (see D66309) - if (!OtherLd->isSimple() || - OtherLd->isIndexed()) - return false; - // Don't mix temporal loads with non-temporal loads. - if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal()) - return false; - if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) - return false; - } else + // The Load's Base Ptr must also match. + auto *OtherLd = dyn_cast<LoadSDNode>(OtherBC); + if (!OtherLd) + return false; + BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG); + if (LoadVT != OtherLd->getMemoryVT()) + return false; + // Loads must only have one use. + if (!OtherLd->hasNUsesOfValue(1, 0)) + return false; + // The memory operands must not be volatile/indexed/atomic. + // TODO: May be able to relax for unordered atomics (see D66309) + if (!OtherLd->isSimple() || OtherLd->isIndexed()) return false; + // Don't mix temporal loads with non-temporal loads. + if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal()) + return false; + if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) + return false; + break; } - if (StoreSrc == StoreSource::Constant) { + case StoreSource::Constant: if (NoTypeMatch) return false; if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC))) return false; - } - if (StoreSrc == StoreSource::Extract) { + break; + case StoreSource::Extract: // Do not merge truncated stores here. 
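Candidate collection above only compares stores whose addresses decompose to the same base with constant byte offsets; the signed difference becomes the MemOpLink offset. A conceptual standalone sketch follows; the struct and function names are hypothetical, not LLVM API.

// Standalone sketch, not part of the patch: two stores are only merge
// candidates when they share a base pointer, in which case their constant
// byte distance is known at compile time.
#include <cassert>
#include <cstdint>
#include <optional>

struct BaseOffset {
  const void *base;  // underlying base pointer (must not be undef/null)
  int64_t offset;    // constant byte offset from that base
};

// Returns the byte distance only when both addresses share a base.
static std::optional<int64_t> byteDiff(const BaseOffset &a,
                                       const BaseOffset &b) {
  if (a.base == nullptr || b.base == nullptr || a.base != b.base)
    return std::nullopt;
  return b.offset - a.offset;
}

int main() {
  char buf[32];
  BaseOffset st0{buf, 8}, st1{buf, 12}, other{buf + 16, 0};

  assert(byteDiff(st0, st1).value() == 4);  // same base: distance is 4 bytes
  assert(!byteDiff(st0, other).has_value()); // different base: not comparable
  return 0;
}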
if (Other->isTruncatingStore()) return false; @@ -16149,6 +16643,9 @@ void DAGCombiner::getStoreMergeCandidates( if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT && OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; + break; + default: + llvm_unreachable("Unhandled store source for merging"); } Ptr = BaseIndexOffset::match(Other, DAG); return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); @@ -16159,11 +16656,22 @@ void DAGCombiner::getStoreMergeCandidates( auto OverLimitInDependenceCheck = [&](SDNode *StoreNode, SDNode *RootNode) -> bool { auto RootCount = StoreRootCountMap.find(StoreNode); - if (RootCount != StoreRootCountMap.end() && - RootCount->second.first == RootNode && - RootCount->second.second > StoreMergeDependenceLimit) - return true; - return false; + return RootCount != StoreRootCountMap.end() && + RootCount->second.first == RootNode && + RootCount->second.second > StoreMergeDependenceLimit; + }; + + auto TryToAddCandidate = [&](SDNode::use_iterator UseIter) { + // This must be a chain use. + if (UseIter.getOperandNo() != 0) + return; + if (auto *OtherStore = dyn_cast<StoreSDNode>(*UseIter)) { + BaseIndexOffset Ptr; + int64_t PtrDiff; + if (CandidateMatch(OtherStore, Ptr, PtrDiff) && + !OverLimitInDependenceCheck(OtherStore, RootNode)) + StoreNodes.push_back(MemOpLink(OtherStore, PtrDiff)); + } }; // We looking for a root node which is an ancestor to all mergable @@ -16185,31 +16693,21 @@ void DAGCombiner::getStoreMergeCandidates( RootNode = St->getChain().getNode(); unsigned NumNodesExplored = 0; - if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) { + const unsigned MaxSearchNodes = 1024; + if (auto *Ldn = dyn_cast<LoadSDNode>(RootNode)) { RootNode = Ldn->getChain().getNode(); for (auto I = RootNode->use_begin(), E = RootNode->use_end(); - I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored) - if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain + I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) { + if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) { // walk down chain for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) - if (I2.getOperandNo() == 0) - if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) { - BaseIndexOffset Ptr; - int64_t PtrDiff; - if (CandidateMatch(OtherST, Ptr, PtrDiff) && - !OverLimitInDependenceCheck(OtherST, RootNode)) - StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); - } - } else + TryToAddCandidate(I2); + } + } + } else { for (auto I = RootNode->use_begin(), E = RootNode->use_end(); - I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored) - if (I.getOperandNo() == 0) - if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { - BaseIndexOffset Ptr; - int64_t PtrDiff; - if (CandidateMatch(OtherST, Ptr, PtrDiff) && - !OverLimitInDependenceCheck(OtherST, RootNode)) - StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); - } + I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) + TryToAddCandidate(I); + } } // We need to check that merging these stores does not cause a loop in @@ -16579,7 +17077,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); - unsigned FirstStoreAlign = FirstInChain->getAlignment(); + Align FirstStoreAlign = FirstInChain->getAlign(); LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode); // Scan the memory operations on the chain and find the first @@ -16674,7 +17172,7 @@ bool 
DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, // the NumElem refers to array/index size. unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); NumElem = std::min(LastLegalType, NumElem); - unsigned FirstLoadAlign = FirstLoad->getAlignment(); + Align FirstLoadAlign = FirstLoad->getAlign(); if (NumElem < 2) { // We know that candidate stores are in order and of correct @@ -16686,8 +17184,8 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, // can here. unsigned NumSkip = 1; while ((NumSkip < LoadNodes.size()) && - (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && - (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) + (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) && + (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); @@ -16760,11 +17258,10 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, FirstLoadAlign, LdMMOFlags); - NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), JointMemOpVT, - FirstInChain->getAlignment(), - FirstInChain->getMemOperand()->getFlags()); + NewStore = DAG.getTruncStore( + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), JointMemOpVT, + FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); } // Transfer chain users from old loads to the new load. @@ -16966,17 +17463,15 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); - unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getAlignment(), MMOFlags, AAInfo); - Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL); - Alignment = MinAlign(Alignment, 4U); + ST->getOriginalAlign(), MMOFlags, AAInfo); + Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), - Alignment, MMOFlags, AAInfo); + ST->getOriginalAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); } @@ -17037,7 +17532,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return NewST; // Try transforming several stores into STORE (BSWAP). - if (SDValue Store = MatchStoreCombine(ST)) + if (SDValue Store = mergeTruncStores(ST)) return Store; if (ST->isUnindexed()) { @@ -17110,11 +17605,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { !ST1->getBasePtr().isUndef() && // BaseIndexOffset and the code below requires knowing the size // of a vector, so bail out if MemoryVT is scalable. 
+ !ST->getMemoryVT().isScalableVector() && !ST1->getMemoryVT().isScalableVector()) { const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG); const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG); - unsigned STBitSize = ST->getMemoryVT().getSizeInBits(); - unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits(); + unsigned STBitSize = ST->getMemoryVT().getFixedSizeInBits(); + unsigned ChainBitSize = ST1->getMemoryVT().getFixedSizeInBits(); // If this is a store who's preceding store to a subset of the current // location and no one other node is chained to that store we can // effectively drop the store. Do not remove stores to undef as they may @@ -17185,8 +17681,7 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { // We walk up the chains to find stores. SmallVector<SDValue, 8> Chains = {N->getOperand(0)}; while (!Chains.empty()) { - SDValue Chain = Chains.back(); - Chains.pop_back(); + SDValue Chain = Chains.pop_back_val(); if (!Chain.hasOneUse()) continue; switch (Chain.getOpcode()) { @@ -17206,11 +17701,16 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) { // TODO: Can relax for unordered atomics (see D66309) if (!ST->isSimple() || ST->isIndexed()) continue; + const TypeSize StoreSize = ST->getMemoryVT().getStoreSize(); + // The bounds of a scalable store are not known until runtime, so this + // store cannot be elided. + if (StoreSize.isScalable()) + continue; const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG); // If we store purely within object bounds just before its lifetime ends, // we can remove the store. if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase, - ST->getMemoryVT().getStoreSizeInBits())) { + StoreSize.getFixedSize() * 8)) { LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump(); dbgs() << "\nwithin LIFETIME_END of : "; LifetimeEndBase.dump(); dbgs() << "\n"); @@ -17309,7 +17809,6 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { return SDValue(); // Start to split store. - unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); @@ -17322,13 +17821,12 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { SDValue Ptr = ST->getBasePtr(); // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getAlignment(), MMOFlags, AAInfo); - Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL); + ST->getOriginalAlign(), MMOFlags, AAInfo); + Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL); // Higher value store. - SDValue St1 = - DAG.getStore(St0, DL, Hi, Ptr, - ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), - Alignment / 2, MMOFlags, AAInfo); + SDValue St1 = DAG.getStore( + St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), + ST->getOriginalAlign(), MMOFlags, AAInfo); return St1; } @@ -17566,6 +18064,13 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, EVT ResultVT = EVE->getValueType(0); EVT VecEltVT = InVecVT.getVectorElementType(); + + // If the vector element type is not a multiple of a byte then we are unable + // to correctly compute an address to load only the extracted element as a + // scalar. 
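The new bail-out above exists because scalarizing an extracted element relies on addressing the element as base + index * element size, which only makes sense when the element type is byte sized (an i1 or i4 lane has no address of its own). A standalone sketch of that address computation, not from this patch:

// Standalone sketch, not part of the patch: computing a byte-addressable
// element's address the way a narrowed scalar load would.
#include <cassert>

int main() {
  float vec[8] = {0, 1, 2, 3, 4, 5, 6, 7};

  unsigned index = 5;
  const char *base = reinterpret_cast<const char *>(vec);
  const float *elt =
      reinterpret_cast<const float *>(base + index * sizeof(float));

  assert(*elt == vec[5]);
  return 0;
}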
+ if (!VecEltVT.isByteSized()) + return SDValue(); + Align Alignment = OriginalLoad->getAlign(); Align NewAlign = DAG.getDataLayout().getABITypeAlign( VecEltVT.getTypeForEVT(*DAG.getContext())); @@ -18201,20 +18706,24 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, // operands will all be based off of VecIn1, even those in VecIn2. unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements(); + uint64_t VTSize = VT.getFixedSizeInBits(); + uint64_t InVT1Size = InVT1.getFixedSizeInBits(); + uint64_t InVT2Size = InVT2.getFixedSizeInBits(); + // We can't generate a shuffle node with mismatched input and output types. // Try to make the types match the type of the output. if (InVT1 != VT || InVT2 != VT) { - if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) { + if ((VTSize % InVT1Size == 0) && InVT1 == InVT2) { // If the output vector length is a multiple of both input lengths, // we can concatenate them and pad the rest with undefs. - unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits(); + unsigned NumConcats = VTSize / InVT1Size; assert(NumConcats >= 2 && "Concat needs at least two inputs!"); SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); ConcatOps[0] = VecIn1; ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); VecIn2 = SDValue(); - } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { + } else if (InVT1Size == VTSize * 2) { if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) return SDValue(); @@ -18227,7 +18736,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, // Since we now have shorter input vectors, adjust the offset of the // second vector's start. Vec2Offset = NumElems; - } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) { + } else if (InVT2Size <= InVT1Size) { // VecIn1 is wider than the output, and we have another, possibly // smaller input. Pad the smaller input with undefs, shuffle at the // input vector width, and extract the output. @@ -18252,8 +18761,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, // when we start sorting the vectors by type. return SDValue(); } - } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && - InVT1.getSizeInBits() == VT.getSizeInBits()) { + } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) { SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2)); ConcatOps[0] = VecIn2; VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); @@ -18444,8 +18952,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // Have we seen this input vector before? // The vectors are expected to be tiny (usually 1 or 2 elements), so using // a map back from SDValues to numbers isn't worth it. - unsigned Idx = std::distance( - VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec)); + unsigned Idx = std::distance(VecIn.begin(), find(VecIn, ExtractedFromVec)); if (Idx == VecIn.size()) VecIn.push_back(ExtractedFromVec); @@ -18795,6 +19302,11 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT OpVT = N->getOperand(0).getValueType(); + + // We currently can't generate an appropriate shuffle for a scalable vector. 
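When the result width is a multiple of the (equal) input widths, the code above concatenates the two inputs and pads the remaining CONCAT_VECTORS operands with undef. A standalone sketch of that operand plan, not from this patch (the widths and placeholder strings are hypothetical, not SDValues):

// Standalone sketch, not part of the patch: computing NumConcats and the
// undef padding for the concat-and-pad path above.
#include <cassert>
#include <string>
#include <vector>

int main() {
  unsigned vtBits = 512, inBits = 128;   // hypothetical result/input widths
  assert(vtBits % inBits == 0);

  unsigned numConcats = vtBits / inBits; // NumConcats in the code above
  assert(numConcats >= 2 && "Concat needs at least two inputs!");

  std::vector<std::string> ops(numConcats, "undef");
  ops[0] = "VecIn1";
  ops[1] = "VecIn2";

  // A 512-bit result built from two 128-bit inputs needs two undef operands.
  assert(ops.size() == 4 && ops[2] == "undef" && ops[3] == "undef");
  return 0;
}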
+ if (VT.isScalableVector()) + return SDValue(); + int NumElts = VT.getVectorNumElements(); int NumOpElts = OpVT.getVectorNumElements(); @@ -18898,7 +19410,7 @@ static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { // check the other type in the cast to make sure this is really legal. EVT VT = N->getValueType(0); EVT SrcEltVT = SrcVT.getVectorElementType(); - unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands(); + ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands(); EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (CastOpcode) { @@ -18935,9 +19447,8 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return DAG.getUNDEF(VT); // Optimize concat_vectors where all but the first of the vectors are undef. - if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) { - return Op.isUndef(); - })) { + if (all_of(drop_begin(N->ops()), + [](const SDValue &Op) { return Op.isUndef(); })) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); @@ -19055,11 +19566,14 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return V; // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR - // nodes often generate nop CONCAT_VECTOR nodes. - // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that - // place the incoming vectors at the exact same location. + // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR + // operands and look for a CONCAT operations that place the incoming vectors + // at the exact same location. + // + // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled. SDValue SingleSource = SDValue(); - unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); + unsigned PartNumElem = + N->getOperand(0).getValueType().getVectorMinNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); @@ -19107,15 +19621,16 @@ static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) { auto *IndexC = dyn_cast<ConstantSDNode>(Index); if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS && V.getOperand(0).getValueType() == SubVT && - (IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) { - uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements(); + (IndexC->getZExtValue() % SubVT.getVectorMinNumElements()) == 0) { + uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorMinNumElements(); return V.getOperand(SubIdx); } return SDValue(); } static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, - SelectionDAG &DAG) { + SelectionDAG &DAG, + bool LegalOperations) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue BinOp = Extract->getOperand(0); unsigned BinOpcode = BinOp.getOpcode(); @@ -19129,7 +19644,7 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, SDValue Index = Extract->getOperand(1); EVT SubVT = Extract->getValueType(0); - if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT)) + if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT, LegalOperations)) return SDValue(); SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT); @@ -19150,11 +19665,12 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, /// If we are extracting a subvector produced by a wide binary operator try /// to use a narrow binary operator and/or avoid concatenation and extraction. 
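The narrowing helpers here rely on the fact that, for elementwise binary ops, extracting a subvector of the wide result equals applying the op to the extracted subvectors. A standalone sketch of that identity, not from this patch:

// Standalone sketch, not part of the patch: extract(subvector of A+B) equals
// (extract of A) + (extract of B) for lane-wise operations, so the wide op
// can be replaced by a narrow one.
#include <array>
#include <cassert>

int main() {
  std::array<int, 8> a{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int, 8> b{10, 20, 30, 40, 50, 60, 70, 80};

  // Wide path: add all 8 lanes, then extract lanes 4..7.
  std::array<int, 8> wide;
  for (int i = 0; i < 8; ++i)
    wide[i] = a[i] + b[i];
  std::array<int, 4> extractOfWide;
  for (int i = 0; i < 4; ++i)
    extractOfWide[i] = wide[4 + i];

  // Narrow path: extract lanes 4..7 of each input, then add 4 lanes.
  std::array<int, 4> subA, subB, narrow;
  for (int i = 0; i < 4; ++i) {
    subA[i] = a[4 + i];
    subB[i] = b[4 + i];
  }
  for (int i = 0; i < 4; ++i)
    narrow[i] = subA[i] + subB[i];

  assert(extractOfWide == narrow);
  return 0;
}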
-static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { +static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, + bool LegalOperations) { // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share // some of these bailouts with other transforms. - if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG)) + if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG, LegalOperations)) return V; // The extract index must be a constant, so we can map it to a concat operand. @@ -19181,7 +19697,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // The binop must be a vector type, so we can extract some fraction of it. EVT WideBVT = BinOp.getValueType(); - if (!WideBVT.isVector()) + // The optimisations below currently assume we are dealing with fixed length + // vectors. It is possible to add support for scalable vectors, but at the + // moment we've done no analysis to prove whether they are profitable or not. + if (!WideBVT.isFixedLengthVector()) return SDValue(); EVT VT = Extract->getValueType(0); @@ -19296,19 +19815,15 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { return SDValue(); unsigned Index = ExtIdx->getZExtValue(); - unsigned NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorMinNumElements(); - // If the index is a multiple of the extract element count, we can offset the - // address by the store size multiplied by the subvector index. Otherwise if - // the scalar type is byte sized, we can just use the index multiplied by - // the element size in bytes as the offset. - unsigned Offset; - if (Index % NumElts == 0) - Offset = (Index / NumElts) * VT.getStoreSize(); - else if (VT.getScalarType().isByteSized()) - Offset = Index * VT.getScalarType().getStoreSize(); - else - return SDValue(); + // The definition of EXTRACT_SUBVECTOR states that the index must be a + // multiple of the minimum number of elements in the result type. + assert(Index % NumElts == 0 && "The extract subvector index is not a " + "multiple of the result's element count"); + + // It's fine to use TypeSize here as we know the offset will not be negative. + TypeSize Offset = VT.getStoreSize() * (Index / NumElts); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT)) @@ -19317,13 +19832,21 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). SDLoc DL(Extract); - SDValue BaseAddr = Ld->getBasePtr(); // TODO: Use "BaseIndexOffset" to make this more effective. 
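The simplified offset rule introduced above works because EXTRACT_SUBVECTOR indices must be multiples of the result's minimum element count, so the narrowed load's byte offset is simply (index / subvector elements) * subvector store size. A standalone sketch with hypothetical fixed-length widths, not from this patch:

// Standalone sketch, not part of the patch: byte offset of an extracted
// subvector within the wide loaded value.
#include <cassert>
#include <cstdint>

static uint64_t narrowLoadOffset(uint64_t index, uint64_t subElts,
                                 uint64_t subStoreBytes) {
  assert(index % subElts == 0 &&
         "extract index must be a multiple of the result element count");
  return (index / subElts) * subStoreBytes;
}

int main() {
  // Extracting the third v4i32 (16 bytes) out of a loaded v16i32: lanes 8..11
  // start 32 bytes into the wide value.
  assert(narrowLoadOffset(/*index=*/8, /*subElts=*/4, /*subStoreBytes=*/16) == 32);
  return 0;
}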
- SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL); + + uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize()); MachineFunction &MF = DAG.getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, - VT.getStoreSize()); + MachineMemOperand *MMO; + if (Offset.isScalable()) { + MachinePointerInfo MPI = + MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()); + MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize); + } else + MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(), + StoreSize); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return NewLd; @@ -19376,8 +19899,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { } if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; - if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) { - ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio; + if (NVT.getVectorElementCount().isKnownMultipleOf(DestSrcRatio)) { + ElementCount NewExtEC = + NVT.getVectorElementCount().divideCoefficientBy(DestSrcRatio); EVT ScalarVT = SrcVT.getScalarType(); if ((ExtIdx % DestSrcRatio) == 0) { SDLoc DL(N); @@ -19391,7 +19915,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { V.getOperand(0), NewIndex); return DAG.getBitcast(NVT, NewExtract); } - if (NewExtEC == 1 && + if (NewExtEC.isScalar() && TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) { SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL); SDValue NewExtract = @@ -19496,7 +20020,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { N->getOperand(1)); } - if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) + if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations)) return NarrowBOp; if (SimplifyDemandedVectorElts(SDValue(N, 0))) @@ -20274,52 +20798,52 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } - // Canonicalize shuffles according to rules: - // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) - // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) - // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) - if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && - N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - TLI.isTypeLegal(VT)) { - // The incoming shuffle must be of the same type as the result of the - // current shuffle. - assert(N1->getOperand(0).getValueType() == VT && - "Shuffle types don't match"); - - SDValue SV0 = N1->getOperand(0); - SDValue SV1 = N1->getOperand(1); - bool HasSameOp0 = N0 == SV0; - bool IsSV1Undef = SV1.isUndef(); - if (HasSameOp0 || IsSV1Undef || N0 == SV1) - // Commute the operands of this shuffle so that next rule - // will trigger. + if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { + // Canonicalize shuffles according to rules: + // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) + // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) + // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) + if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && + N0.getOpcode() != ISD::VECTOR_SHUFFLE) { + // The incoming shuffle must be of the same type as the result of the + // current shuffle. 
+ assert(N1->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + SDValue SV0 = N1->getOperand(0); + SDValue SV1 = N1->getOperand(1); + bool HasSameOp0 = N0 == SV0; + bool IsSV1Undef = SV1.isUndef(); + if (HasSameOp0 || IsSV1Undef || N0 == SV1) + // Commute the operands of this shuffle so merging below will trigger. + return DAG.getCommutedVectorShuffle(*SVN); + } + + // Canonicalize splat shuffles to the RHS to improve merging below. + // shuffle(splat(A,u), shuffle(C,D)) -> shuffle'(shuffle(C,D), splat(A,u)) + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && + N1.getOpcode() == ISD::VECTOR_SHUFFLE && + cast<ShuffleVectorSDNode>(N0)->isSplat() && + !cast<ShuffleVectorSDNode>(N1)->isSplat()) { return DAG.getCommutedVectorShuffle(*SVN); + } } - // Try to fold according to rules: - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) - // Don't try to fold shuffles with illegal type. - // Only fold if this shuffle is the only user of the other shuffle. - if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && - Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { - ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); - + // Compute the combined shuffle mask for a shuffle with SV0 as the first + // operand, and SV1 as the second operand. + // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask). + auto MergeInnerShuffle = [NumElts](ShuffleVectorSDNode *SVN, + ShuffleVectorSDNode *OtherSVN, SDValue N1, + SDValue &SV0, SDValue &SV1, + SmallVectorImpl<int> &Mask) -> bool { // Don't try to fold splats; they're likely to simplify somehow, or they // might be free. - if (OtherSV->isSplat()) - return SDValue(); + if (OtherSVN->isSplat()) + return false; - // The incoming shuffle must be of the same type as the result of the - // current shuffle. - assert(OtherSV->getOperand(0).getValueType() == VT && - "Shuffle types don't match"); + SV0 = SV1 = SDValue(); + Mask.clear(); - SDValue SV0, SV1; - SmallVector<int, 4> Mask; - // Compute the combined shuffle mask for a shuffle with SV0 as the first - // operand, and SV1 as the second operand. for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx < 0) { @@ -20332,15 +20856,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (Idx < (int)NumElts) { // This shuffle index refers to the inner shuffle N0. Lookup the inner // shuffle mask to identify which vector is actually referenced. - Idx = OtherSV->getMaskElt(Idx); + Idx = OtherSVN->getMaskElt(Idx); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } - - CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) - : OtherSV->getOperand(1); + CurrentVec = (Idx < (int)NumElts) ? OtherSVN->getOperand(0) + : OtherSVN->getOperand(1); } else { // This shuffle index references an element within N1. CurrentVec = N1; @@ -20362,38 +20885,82 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { Mask.push_back(Idx); continue; } + if (!SV1.getNode() || SV1 == CurrentVec) { + // Ok. CurrentVec is the right hand side. + // Update the mask accordingly. + SV1 = CurrentVec; + Mask.push_back(Idx + NumElts); + continue; + } - // Bail out if we cannot convert the shuffle pair into a single shuffle. - if (SV1.getNode() && SV1 != CurrentVec) - return SDValue(); + // Last chance - see if the vector is another shuffle and if it + // uses one of the existing candidate shuffle ops. 
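The lambda being introduced here composes two shuffle masks into one. The standalone sketch below (not from this patch) shows the core of that composition for the case where the outer shuffle's second operand equals one of the inner shuffle's operands, so two sources suffice; undef (-1) mask entries are left out to keep it short.

// Standalone sketch, not part of the patch:
//   shuffle(shuffle(A, B, M0), B, M1)  ==  shuffle(A, B, M2)
// where M2 looks outer indices that hit the inner shuffle up through M0.
#include <array>
#include <cassert>

int main() {
  constexpr int N = 4;
  std::array<int, N> A{0, 1, 2, 3}, B{10, 11, 12, 13};
  std::array<int, N> M0{3, 0, 5, 6};  // inner mask over concat(A, B)
  std::array<int, N> M1{2, 4, 0, 7};  // outer mask over concat(inner, B)

  auto pick = [&](const std::array<int, N> &x, const std::array<int, N> &y,
                  int idx) { return idx < N ? x[idx] : y[idx - N]; };

  // Reference result: apply the two shuffles one after the other.
  std::array<int, N> inner, ref;
  for (int i = 0; i < N; ++i) inner[i] = pick(A, B, M0[i]);
  for (int i = 0; i < N; ++i) ref[i] = pick(inner, B, M1[i]);

  // Merged mask: indices into the inner shuffle go through M0; indices into B
  // already address the second source and are kept as-is.
  std::array<int, N> M2, merged;
  for (int i = 0; i < N; ++i) M2[i] = M1[i] < N ? M0[M1[i]] : M1[i];
  for (int i = 0; i < N; ++i) merged[i] = pick(A, B, M2[i]);

  assert(merged == ref);
  return 0;
}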
+ if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) { + int InnerIdx = CurrentSVN->getMaskElt(Idx); + if (InnerIdx < 0) { + Mask.push_back(-1); + continue; + } + SDValue InnerVec = (InnerIdx < (int)NumElts) + ? CurrentSVN->getOperand(0) + : CurrentSVN->getOperand(1); + if (InnerVec.isUndef()) { + Mask.push_back(-1); + continue; + } + InnerIdx %= NumElts; + if (InnerVec == SV0) { + Mask.push_back(InnerIdx); + continue; + } + if (InnerVec == SV1) { + Mask.push_back(InnerIdx + NumElts); + continue; + } + } - // Ok. CurrentVec is the right hand side. - // Update the mask accordingly. - SV1 = CurrentVec; - Mask.push_back(Idx + NumElts); + // Bail out if we cannot convert the shuffle pair into a single shuffle. + return false; } + return true; + }; - // Check if all indices in Mask are Undef. In case, propagate Undef. - bool isUndefMask = true; - for (unsigned i = 0; i != NumElts && isUndefMask; ++i) - isUndefMask &= Mask[i] < 0; + // Try to fold according to rules: + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) + // Don't try to fold shuffles with illegal type. + // Only fold if this shuffle is the only user of the other shuffle. + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && + Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { + ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); - if (isUndefMask) - return DAG.getUNDEF(VT); + // The incoming shuffle must be of the same type as the result of the + // current shuffle. + assert(OtherSV->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + SDValue SV0, SV1; + SmallVector<int, 4> Mask; + if (MergeInnerShuffle(SVN, OtherSV, N1, SV0, SV1, Mask)) { + // Check if all indices in Mask are Undef. In case, propagate Undef. + if (llvm::all_of(Mask, [](int M) { return M < 0; })) + return DAG.getUNDEF(VT); - if (!SV0.getNode()) - SV0 = DAG.getUNDEF(VT); - if (!SV1.getNode()) - SV1 = DAG.getUNDEF(VT); + if (!SV0.getNode()) + SV0 = DAG.getUNDEF(VT); + if (!SV1.getNode()) + SV1 = DAG.getUNDEF(VT); - // Avoid introducing shuffles with illegal mask. - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) - // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) - return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG); + // Avoid introducing shuffles with illegal mask. 
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) + return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG); + } } if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG)) @@ -20478,8 +21045,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(0).getOperand(1) == N2 && - N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == - VT.getVectorNumElements() && + N1.getOperand(0).getOperand(0).getValueType().getVectorElementCount() == + VT.getVectorElementCount() && N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == VT.getSizeInBits()) { return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); @@ -20496,7 +21063,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { EVT CN1VT = CN1.getValueType(); if (CN0VT.isVector() && CN1VT.isVector() && CN0VT.getVectorElementType() == CN1VT.getVectorElementType() && - CN0VT.getVectorNumElements() == VT.getVectorNumElements()) { + CN0VT.getVectorElementCount() == VT.getVectorElementCount()) { SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), CN0.getValueType(), CN0, CN1, N2); return DAG.getBitcast(VT, NewINSERT); @@ -20535,7 +21102,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDLoc DL(N); SDValue NewIdx; LLVMContext &Ctx = *DAG.getContext(); - unsigned NumElts = VT.getVectorNumElements(); + ElementCount NumElts = VT.getVectorElementCount(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) { unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits(); @@ -20543,8 +21110,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL); } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) { unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits; - if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) { - NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale); + if (NumElts.isKnownMultipleOf(Scale) && (InsIdx % Scale) == 0) { + NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, + NumElts.divideCoefficientBy(Scale)); NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL); } } @@ -20576,8 +21144,10 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { // If the input vector is a concatenation, and the insert replaces // one of the pieces, we can optimize into a single concat_vectors. if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && - N0.getOperand(0).getValueType() == N1.getValueType()) { - unsigned Factor = N1.getValueType().getVectorNumElements(); + N0.getOperand(0).getValueType() == N1.getValueType() && + N0.getOperand(0).getValueType().isScalableVector() == + N1.getValueType().isScalableVector()) { + unsigned Factor = N1.getValueType().getVectorMinNumElements(); SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end()); Ops[InsIdx / Factor] = N1; return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); @@ -20621,7 +21191,7 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { unsigned Opcode = N->getOpcode(); // VECREDUCE over 1-element vector is just an extract. 
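The INSERT_SUBVECTOR fold above ("Ops[InsIdx / Factor] = N1") relies on the insert index lining up with one piece of the CONCAT_VECTORS, in which case the insert is just a replacement of that piece. A standalone sketch of the slot computation, not from this patch:

// Standalone sketch, not part of the patch: inserting a subvector at a
// piece-aligned index of a concatenation equals rebuilding the concat with
// slot InsIdx / Factor replaced.
#include <array>
#include <cassert>

int main() {
  constexpr int Factor = 2;                        // elements per concat piece
  std::array<int, 8> wide{0, 1, 2, 3, 4, 5, 6, 7}; // concat of four 2-elt pieces
  std::array<int, Factor> piece{40, 41};           // subvector being inserted
  int insIdx = 4;
  assert(insIdx % Factor == 0 && "insert index must be piece aligned");

  // INSERT_SUBVECTOR semantics: overwrite lanes [insIdx, insIdx + Factor).
  std::array<int, 8> inserted = wide;
  for (int i = 0; i < Factor; ++i) inserted[insIdx + i] = piece[i];

  // Concat rewrite: replace piece number insIdx / Factor and re-concatenate.
  std::array<int, 8> reconcat = wide;
  int slot = insIdx / Factor;
  for (int i = 0; i < Factor; ++i) reconcat[slot * Factor + i] = piece[i];

  assert(inserted == reconcat);
  return 0;
}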
- if (VT.getVectorNumElements() == 1) { + if (VT.getVectorElementCount().isScalar()) { SDLoc dl(N); SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0, @@ -20860,7 +21430,8 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { SDValue Z = LHS.getOperand(2); EVT NarrowVT = X.getValueType(); if (NarrowVT == Y.getValueType() && - TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) { + TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT, + LegalOperations)) { // (binop undef, undef) may not return undef, so compute that result. SDLoc DL(N); SDValue VecC = @@ -20873,11 +21444,10 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { // Make sure all but the first op are undef or constant. auto ConcatWithConstantOrUndef = [](SDValue Concat) { return Concat.getOpcode() == ISD::CONCAT_VECTORS && - std::all_of(std::next(Concat->op_begin()), Concat->op_end(), - [](const SDValue &Op) { - return Op.isUndef() || - ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); - }); + all_of(drop_begin(Concat->ops()), [](const SDValue &Op) { + return Op.isUndef() || + ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); + }); }; // The following pattern is likely to emerge with vector reduction ops. Moving @@ -21099,7 +21669,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, // It is safe to replace the two loads if they have different alignments, // but the new load must be the minimum (most restrictive) alignment of the // inputs. - unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); + Align Alignment = std::min(LLD->getAlign(), RLD->getAlign()); MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags(); if (!RLD->isInvariant()) MMOFlags &= ~MachineMemOperand::MOInvariant; @@ -21205,6 +21775,46 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } +// Transform (fneg/fabs (bitconvert x)) to avoid loading constant pool values. +SDValue DAGCombiner::foldSignChangeInBitcast(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + bool IsFabs = N->getOpcode() == ISD::FABS; + bool IsFree = IsFabs ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); + + if (IsFree || N0.getOpcode() != ISD::BITCAST || !N0.hasOneUse()) + return SDValue(); + + SDValue Int = N0.getOperand(0); + EVT IntVT = Int.getValueType(); + + // The operand to cast should be integer. + if (!IntVT.isInteger() || IntVT.isVector()) + return SDValue(); + + // (fneg (bitconvert x)) -> (bitconvert (xor x sign)) + // (fabs (bitconvert x)) -> (bitconvert (and x ~sign)) + APInt SignMask; + if (N0.getValueType().isVector()) { + // For vector, create a sign mask (0x80...) or its inverse (for fabs, + // 0x7f...) per element and splat it. + SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); + if (IsFabs) + SignMask = ~SignMask; + SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); + } else { + // For scalar, just use the sign mask (0x80... or the inverse, 0x7f...) + SignMask = APInt::getSignMask(IntVT.getSizeInBits()); + if (IsFabs) + SignMask = ~SignMask; + } + SDLoc DL(N0); + Int = DAG.getNode(IsFabs ? ISD::AND : ISD::XOR, DL, IntVT, Int, + DAG.getConstant(SignMask, DL, IntVT)); + AddToWorklist(Int.getNode()); + return DAG.getBitcast(VT, Int); +} + /// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" /// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 /// in it. 
This may be a win when the constant is not otherwise available @@ -21486,9 +22096,8 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { EVT VT = V.getValueType(); - unsigned EltBits = VT.getScalarSizeInBits(); SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); - SDValue Base = DAG.getConstant(EltBits - 1, DL, VT); + SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); return LogBase2; } @@ -21666,37 +22275,21 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, Reciprocal)) { AddToWorklist(Est.getNode()); - if (Iterations) { + if (Iterations) Est = UseOneConstNR ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); - - if (!Reciprocal) { - // The estimate is now completely wrong if the input was exactly 0.0 or - // possibly a denormal. Force the answer to 0.0 for those cases. - SDLoc DL(Op); - EVT CCVT = getSetCCResultType(VT); - ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - DenormalMode DenormMode = DAG.getDenormalMode(VT); - if (DenormMode.Input == DenormalMode::IEEE) { - // This is specifically a check for the handling of denormal inputs, - // not the result. - - // fabs(X) < SmallestNormal ? 0.0 : Est - const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); - APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); - SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); - SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); - Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); - } else { - // X == 0.0 ? 0.0 : Est - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); - Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); - } - } + if (!Reciprocal) { + SDLoc DL(Op); + // Try the target specific test first. + SDValue Test = TLI.getSqrtInputTest(Op, DAG, DAG.getDenormalMode(VT)); + + // The estimate is now completely wrong if the input was exactly 0.0 or + // possibly a denormal. Force the answer to 0.0 or value provided by + // target for those cases. + Est = DAG.getNode( + Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, + Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est); } return Est; } |
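The new foldSignChangeInBitcast helper above unifies the fneg and fabs bit tricks: flip the sign bit with XOR, or clear it with AND of the inverted sign mask, on the integer image of the value. A standalone sketch of those identities (not from this patch; uses C++20 std::bit_cast):

// Standalone sketch, not part of the patch:
//   (fneg (bitconvert x)) -> (bitconvert (xor x sign))
//   (fabs (bitconvert x)) -> (bitconvert (and x ~sign))
#include <bit>
#include <cassert>
#include <cstdint>

static float fnegBits(float x) {
  uint32_t bits = std::bit_cast<uint32_t>(x);
  return std::bit_cast<float>(bits ^ 0x80000000u); // xor with the sign mask
}

static float fabsBits(float x) {
  uint32_t bits = std::bit_cast<uint32_t>(x);
  return std::bit_cast<float>(bits & 0x7FFFFFFFu); // and with ~sign mask
}

int main() {
  assert(fnegBits(1.5f) == -1.5f);
  assert(fnegBits(-0.25f) == 0.25f);
  assert(fabsBits(-3.0f) == 3.0f);
  assert(fabsBits(2.0f) == 2.0f);
  return 0;
}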