Diffstat (limited to 'lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3271 |
1 file changed, 2357 insertions, 914 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ff5505c97721..49c922f560fa 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1,9 +1,8 @@
 //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -112,6 +111,10 @@
 static cl::opt<bool>
   MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
                     cl::desc("DAG combiner may split indexing from loads"));
 
+static cl::opt<unsigned> TokenFactorInlineLimit(
+    "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
+    cl::desc("Limit the number of operands to inline for Token Factors"));
+
 namespace {
 
 class DAGCombiner {
@@ -138,6 +141,10 @@
     /// them) when they are deleted from the underlying DAG. It relies on
     /// stable indices of nodes within the worklist.
    DenseMap<SDNode *, unsigned> WorklistMap;
+    /// This records all nodes attempted to be added to the worklist since we
+    /// considered a new worklist entry. Since we do not add duplicate nodes
+    /// to the worklist, this is different from the tail of the worklist.
+    SmallSetVector<SDNode *, 32> PruningList;
 
     /// Set of nodes which have been combined (at least once).
    ///
@@ -155,6 +162,37 @@
       AddToWorklist(Node);
     }
 
+    // Prune potentially dangling nodes. This is called after
+    // any visit to a node, but should also be called during a visit after any
+    // failed combine which may have created a DAG node.
+    void clearAddedDanglingWorklistEntries() {
+      // Check any nodes added to the worklist to see if they are prunable.
+      while (!PruningList.empty()) {
+        auto *N = PruningList.pop_back_val();
+        if (N->use_empty())
+          recursivelyDeleteUnusedNodes(N);
+      }
+    }
+
+    SDNode *getNextWorklistEntry() {
+      // Before we do any work, remove nodes that are not in use.
+      clearAddedDanglingWorklistEntries();
+      SDNode *N = nullptr;
+      // The Worklist holds the SDNodes in order, but it may contain null
+      // entries.
+      while (!N && !Worklist.empty()) {
+        N = Worklist.pop_back_val();
+      }
+
+      if (N) {
+        bool GoodWorklistEntry = WorklistMap.erase(N);
+        (void)GoodWorklistEntry;
+        assert(GoodWorklistEntry &&
+               "Found a worklist entry without a corresponding map entry!");
+      }
+      return N;
+    }
+
     /// Call the node-specific routine that folds each particular type of node.
     SDValue visit(SDNode *N);
 
@@ -162,7 +200,7 @@
     DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), AA(AA) {
-      ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();
+      ForCodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
 
       MaximumLegalStoreInBits = 0;
       for (MVT VT : MVT::all_valuetypes())
@@ -172,6 +210,11 @@
             MaximumLegalStoreInBits = VT.getSizeInBits();
     }
 
+    void ConsiderForPruning(SDNode *N) {
+      // Mark this for potential pruning.
+      PruningList.insert(N);
+    }
+
     /// Add to the worklist making sure its instance is at the back (next to be
     /// processed.)
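Aside: the PruningList/worklist interplay above can be sketched standalone. Below is a plain-C++ illustration under assumed names (Node, useCount), not LLVM code: nodes touched since the last pop are remembered, and any that ended up unused are dropped before the next worklist entry is handed out.

#include <cassert>
#include <vector>

struct Node { int useCount = 0; bool deleted = false; };

struct Worklist {
  std::vector<Node *> Work;    // pending nodes; back of the vector is next
  std::vector<Node *> Pruning; // candidates touched since the last pop

  void add(Node *N) { Work.push_back(N); Pruning.push_back(N); }

  // Mirrors clearAddedDanglingWorklistEntries(): delete dead candidates.
  void prune() {
    while (!Pruning.empty()) {
      Node *N = Pruning.back();
      Pruning.pop_back();
      if (!N->deleted && N->useCount == 0)
        N->deleted = true; // recursivelyDeleteUnusedNodes() in the real code
    }
  }

  // Mirrors getNextWorklistEntry(): prune first, then pop a live node.
  Node *next() {
    prune();
    while (!Work.empty()) {
      Node *N = Work.back();
      Work.pop_back();
      if (!N->deleted)
        return N;
    }
    return nullptr;
  }
};

int main() {
  Node A, B;
  A.useCount = 1; // A is still referenced; B is dead
  Worklist WL;
  WL.add(&A);
  WL.add(&B);
  assert(WL.next() == &A); // B was pruned before the pop
  assert(WL.next() == nullptr);
  return 0;
}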
     void AddToWorklist(SDNode *N) {
@@ -183,6 +226,8 @@
       if (N->getOpcode() == ISD::HANDLENODE)
         return;
 
+      ConsiderForPruning(N);
+
       if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
         Worklist.push_back(N);
     }
@@ -190,6 +235,7 @@
     /// Remove all instances of N from the worklist.
     void removeFromWorklist(SDNode *N) {
       CombinedNodes.erase(N);
+      PruningList.remove(N);
 
       auto It = WorklistMap.find(N);
       if (It == WorklistMap.end())
@@ -229,8 +275,15 @@
     /// If so, return true.
     bool SimplifyDemandedBits(SDValue Op) {
       unsigned BitWidth = Op.getScalarValueSizeInBits();
-      APInt Demanded = APInt::getAllOnesValue(BitWidth);
-      return SimplifyDemandedBits(Op, Demanded);
+      APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
+      return SimplifyDemandedBits(Op, DemandedBits);
+    }
+
+    bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
+      EVT VT = Op.getValueType();
+      unsigned NumElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+      APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+      return SimplifyDemandedBits(Op, DemandedBits, DemandedElts);
     }
 
     /// Check the specified vector node value to see if it can be simplified or
@@ -238,12 +291,13 @@
     /// elements. If so, return true.
     bool SimplifyDemandedVectorElts(SDValue Op) {
       unsigned NumElts = Op.getValueType().getVectorNumElements();
-      APInt Demanded = APInt::getAllOnesValue(NumElts);
-      return SimplifyDemandedVectorElts(Op, Demanded);
+      APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+      return SimplifyDemandedVectorElts(Op, DemandedElts);
     }
 
-    bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
-    bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
+    bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+                              const APInt &DemandedElts);
+    bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
                                     bool AssumeSingleUse = false);
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
@@ -291,15 +345,16 @@
     SDValue visitTokenFactor(SDNode *N);
     SDValue visitMERGE_VALUES(SDNode *N);
     SDValue visitADD(SDNode *N);
-    SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
+    SDValue visitADDLike(SDNode *N);
+    SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
     SDValue visitSUB(SDNode *N);
     SDValue visitADDSAT(SDNode *N);
     SDValue visitSUBSAT(SDNode *N);
     SDValue visitADDC(SDNode *N);
-    SDValue visitUADDO(SDNode *N);
+    SDValue visitADDO(SDNode *N);
     SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitSUBC(SDNode *N);
-    SDValue visitUSUBO(SDNode *N);
+    SDValue visitSUBO(SDNode *N);
     SDValue visitADDE(SDNode *N);
     SDValue visitADDCARRY(SDNode *N);
     SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
@@ -316,8 +371,7 @@
     SDValue visitMULHS(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
-    SDValue visitSMULO(SDNode *N);
-    SDValue visitUMULO(SDNode *N);
+    SDValue visitMULO(SDNode *N);
     SDValue visitIMINMAX(SDNode *N);
     SDValue visitAND(SDNode *N);
     SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
@@ -386,6 +440,7 @@
     SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
 
     SDValue visitSTORE(SDNode *N);
+    SDValue visitLIFETIME_END(SDNode *N);
     SDValue visitINSERT_VECTOR_ELT(SDNode *N);
     SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
     SDValue visitBUILD_VECTOR(SDNode *N);
@@ -400,13 +455,19 @@
     SDValue visitMSCATTER(SDNode *N);
     SDValue visitFP_TO_FP16(SDNode *N);
     SDValue visitFP16_TO_FP(SDNode *N);
+    SDValue visitVECREDUCE(SDNode *N);
     SDValue visitFADDForFMACombine(SDNode *N);
     SDValue visitFSUBForFMACombine(SDNode *N);
     SDValue visitFMULForFMADistributiveCombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
-    SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+    bool reassociationCanBreakAddressingModePattern(unsigned Opc,
+                                                    const SDLoc &DL, SDValue N0,
+                                                    SDValue N1);
+    SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                      SDValue N1);
+    SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                            SDValue N1, SDNodeFlags Flags);
 
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
@@ -466,6 +527,7 @@
                               const SDLoc &DL);
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
+    SDValue MatchStoreCombine(StoreSDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
@@ -475,7 +537,8 @@
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
-                                  SDValue VecIn2, unsigned LeftIdx);
+                                  SDValue VecIn2, unsigned LeftIdx,
+                                  bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
 
     /// Walk up chain skipping non-aliasing memory nodes,
@@ -484,7 +547,7 @@
                        SmallVectorImpl<SDValue> &Aliases);
 
     /// Return true if there is any possibility that the two addresses overlap.
-    bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const;
+    bool isAlias(SDNode *Op0, SDNode *Op1) const;
 
     /// Walk up chain skipping non-aliasing memory nodes, looking for a better
     /// chain (aliasing node.)
@@ -642,6 +705,18 @@
   }
 };
 
+class WorklistInserter : public SelectionDAG::DAGUpdateListener {
+  DAGCombiner &DC;
+
+public:
+  explicit WorklistInserter(DAGCombiner &dc)
+      : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
+
+  // FIXME: Ideally we could add N to the worklist, but this causes exponential
+  // compile time costs in large DAGs, e.g. Halide.
+  void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
+};
+
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -697,20 +772,23 @@ void DAGCombiner::deleteAndRecombine(SDNode *N) {
 static char isNegatibleForFree(SDValue Op, bool LegalOperations,
                                const TargetLowering &TLI,
                                const TargetOptions *Options,
+                               bool ForCodeSize,
                                unsigned Depth = 0) {
   // fneg is removable even if it has multiple uses.
-  if (Op.getOpcode() == ISD::FNEG) return 2;
+  if (Op.getOpcode() == ISD::FNEG)
+    return 2;
 
   // Don't allow anything with multiple uses unless we know it is free.
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
-  if (!Op.hasOneUse())
-    if (!(Op.getOpcode() == ISD::FP_EXTEND &&
-          TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))
-      return 0;
+  if (!Op.hasOneUse() &&
+      !(Op.getOpcode() == ISD::FP_EXTEND &&
+        TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))
+    return 0;
 
   // Don't recurse exponentially.
-  if (Depth > 6) return 0;
+  if (Depth > 6)
+    return 0;
 
   switch (Op.getOpcode()) {
   default: return false;
@@ -721,7 +799,25 @@
     // Don't invert constant FP values after legalization unless the target says
     // the negated constant is legal.
     return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
-           TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
+           TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
+                            ForCodeSize);
+  }
+  case ISD::BUILD_VECTOR: {
+    // Only permit BUILD_VECTOR of constants.
+    if (llvm::any_of(Op->op_values(), [&](SDValue N) {
+          return !N.isUndef() && !isa<ConstantFPSDNode>(N);
+        }))
+      return 0;
+    if (!LegalOperations)
+      return 1;
+    if (TLI.isOperationLegal(ISD::ConstantFP, VT) &&
+        TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+      return 1;
+    return llvm::all_of(Op->op_values(), [&](SDValue N) {
+      return N.isUndef() ||
+             TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
+                              ForCodeSize);
+    });
+  }
   case ISD::FADD:
     if (!Options->UnsafeFPMath && !Flags.hasNoSignedZeros())
       return 0;
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
-                                    Options, Depth + 1))
+                                    Options, ForCodeSize, Depth + 1))
       return V;
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
     return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
-                              Depth + 1);
+                              ForCodeSize, Depth + 1);
   case ISD::FSUB:
     // We can't turn -(A-B) into B-A when we honor signed zeros.
-    if (!Options->NoSignedZerosFPMath &&
-        !Flags.hasNoSignedZeros())
+    if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
       return 0;
 
     // fold (fneg (fsub A, B)) -> (fsub B, A)
@@ -751,30 +846,31 @@
   case ISD::FDIV:
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
     if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
-                                    Options, Depth + 1))
+                                    Options, ForCodeSize, Depth + 1))
       return V;
 
     return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
-                              Depth + 1);
+                              ForCodeSize, Depth + 1);
 
   case ISD::FP_EXTEND:
   case ISD::FP_ROUND:
   case ISD::FSIN:
     return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
-                              Depth + 1);
+                              ForCodeSize, Depth + 1);
   }
 }
 
 /// If isNegatibleForFree returns true, return the newly negated expression.
 static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
-                                    bool LegalOperations, unsigned Depth = 0) {
-  const TargetOptions &Options = DAG.getTarget().Options;
+                                    bool LegalOperations, bool ForCodeSize,
+                                    unsigned Depth = 0) {
   // fneg is removable even if it has multiple uses.
-  if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
+  if (Op.getOpcode() == ISD::FNEG)
+    return Op.getOperand(0);
 
   assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
-
-  const SDNodeFlags Flags = Op.getNode()->getFlags();
+  const TargetOptions &Options = DAG.getTarget().Options;
+  const SDNodeFlags Flags = Op->getFlags();
 
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Unknown code");
@@ -783,24 +879,41 @@
     V.changeSign();
     return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
   }
+  case ISD::BUILD_VECTOR: {
+    SmallVector<SDValue, 4> Ops;
+    for (SDValue C : Op->op_values()) {
+      if (C.isUndef()) {
+        Ops.push_back(C);
+        continue;
+      }
+      APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
+      V.changeSign();
+      Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType()));
+    }
+    return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops);
+  }
   case ISD::FADD:
     assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros());
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
-                           DAG.getTargetLoweringInfo(), &Options, Depth+1))
+                           DAG.getTargetLoweringInfo(), &Options, ForCodeSize,
+                           Depth + 1))
       return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
-                                              LegalOperations, Depth+1),
+                                              LegalOperations, ForCodeSize,
+                                              Depth + 1),
                          Op.getOperand(1), Flags);
     // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
     return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                        GetNegatedExpression(Op.getOperand(1), DAG,
-                                            LegalOperations, Depth+1),
+                                            LegalOperations, ForCodeSize,
+                                            Depth + 1),
                        Op.getOperand(0), Flags);
   case ISD::FSUB:
     // fold (fneg (fsub 0, B)) -> B
-    if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
+    if (ConstantFPSDNode *N0CFP =
+            isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true))
       if (N0CFP->isZero())
         return Op.getOperand(1);
 
@@ -812,28 +925,33 @@
   case ISD::FDIV:
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
-                           DAG.getTargetLoweringInfo(), &Options, Depth+1))
+                           DAG.getTargetLoweringInfo(), &Options, ForCodeSize,
+                           Depth + 1))
       return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
-                                              LegalOperations, Depth+1),
+                                              LegalOperations, ForCodeSize,
+                                              Depth + 1),
                          Op.getOperand(1), Flags);
 
     // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
     return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                        Op.getOperand(0),
                        GetNegatedExpression(Op.getOperand(1), DAG,
-                                            LegalOperations, Depth+1), Flags);
+                                            LegalOperations, ForCodeSize,
+                                            Depth + 1), Flags);
 
   case ISD::FP_EXTEND:
   case ISD::FSIN:
     return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                        GetNegatedExpression(Op.getOperand(0), DAG,
-                                            LegalOperations, Depth+1));
+                                            LegalOperations, ForCodeSize,
+                                            Depth + 1));
   case ISD::FP_ROUND:
-    return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
-                       GetNegatedExpression(Op.getOperand(0), DAG,
-                                            LegalOperations, Depth+1),
-                       Op.getOperand(1));
+    return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
+                       GetNegatedExpression(Op.getOperand(0), DAG,
+                                            LegalOperations, ForCodeSize,
+                                            Depth + 1),
+                       Op.getOperand(1));
   }
 }
 
@@ -924,53 +1042,113 @@ static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
          ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
 }
 
-SDValue
-DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
-                            SDValue N1, SDNodeFlags Flags) {
-  // Don't reassociate reductions.
-  if (Flags.hasVectorReduction())
-    return SDValue();
+bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
+                                                             const SDLoc &DL,
+                                                             SDValue N0,
+                                                             SDValue N1) {
+  // Currently this only tries to ensure we don't undo the GEP splits done by
+  // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
+  // we check if the following transformation would be problematic:
+  // (load/store (add, (add, x, offset1), offset2)) ->
+  // (load/store (add, x, offset1+offset2)).
 
-  EVT VT = N0.getValueType();
-  if (N0.getOpcode() == Opc && !N0->getFlags().hasVectorReduction()) {
-    if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
-      if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
-        // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R))
-          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-        return SDValue();
-      }
-      if (N0.hasOneUse()) {
-        // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
-      }
+  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
+    return false;
+
+  if (N0.hasOneUse())
+    return false;
+
+  auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  auto *C2 = dyn_cast<ConstantSDNode>(N1);
+  if (!C1 || !C2)
+    return false;
+
+  const APInt &C1APIntVal = C1->getAPIntValue();
+  const APInt &C2APIntVal = C2->getAPIntValue();
+  if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
+    return false;
+
+  const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
+  if (CombinedValueIntVal.getBitWidth() > 64)
+    return false;
+  const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
+
+  for (SDNode *Node : N0->uses()) {
+    auto LoadStore = dyn_cast<MemSDNode>(Node);
+    if (LoadStore) {
+      // Is x[offset2] already not a legal addressing mode? If so then
+      // reassociating the constants breaks nothing (we test offset2 because
+      // that's the one we hope to fold into the load or store).
+      TargetLoweringBase::AddrMode AM;
+      AM.HasBaseReg = true;
+      AM.BaseOffs = C2APIntVal.getSExtValue();
+      EVT VT = LoadStore->getMemoryVT();
+      unsigned AS = LoadStore->getAddressSpace();
+      Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+        continue;
+
+      // Would x[offset1+offset2] still be a legal addressing mode?
+      AM.BaseOffs = CombinedValue;
+      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+        return true;
     }
   }
-  if (N1.getOpcode() == Opc && !N1->getFlags().hasVectorReduction()) {
-    if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
-      if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
-        // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L))
-          return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
 
+  return false;
+}
+
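Aside: a standalone sketch (plain C++, not part of this patch) of the check above, with a made-up legal-immediate range standing in for isLegalAddressingMode(). The point: if x[offset2] folds into the load but x[offset1+offset2] would not, reassociating the two adds loses an addressing mode.

#include <cstdint>
#include <iostream>

// Assumed target limit for an immediate load/store offset, e.g. [0, 4095].
constexpr int64_t kMaxImm = 4095;

bool isLegalOffset(int64_t Off) { return Off >= 0 && Off <= kMaxImm; }

// True if (add (add x, C1), C2) -> (add x, C1+C2) would break a fold:
// offset2 alone folds into the access, but the combined offset does not.
bool reassociationBreaksAddressing(int64_t C1, int64_t C2) {
  return isLegalOffset(C2) && !isLegalOffset(C1 + C2);
}

int main() {
  std::cout << reassociationBreaksAddressing(4000, 200) << '\n'; // 1: breaks
  std::cout << reassociationBreaksAddressing(10, 20) << '\n';    // 0: fine
  return 0;
}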
+// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
+// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
+SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
+                                               SDValue N0, SDValue N1) {
+  EVT VT = N0.getValueType();
+
+  if (N0.getOpcode() != Opc)
+    return SDValue();
+
+  // Don't reassociate reductions.
+  if (N0->getFlags().hasVectorReduction())
+    return SDValue();
+
+  if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+    if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
+      if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2))
+        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+      return SDValue();
+    }
+    if (N0.hasOneUse()) {
+      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+      // iff (op x, c1) has one use
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
+      if (!OpNode.getNode())
         return SDValue();
-      }
-      if (N1.hasOneUse()) {
-        // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
-      }
+      AddToWorklist(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
     }
   }
+  return SDValue();
+}
 
+// Try to reassociate commutative binops.
+SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                    SDValue N1, SDNodeFlags Flags) {
+  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
+  // Don't reassociate reductions.
+  if (Flags.hasVectorReduction())
+    return SDValue();
+
+  // Floating-point reassociation is not allowed without loose FP math.
+  if (N0.getValueType().isFloatingPoint() ||
+      N1.getValueType().isFloatingPoint())
+    if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
+      return SDValue();
+
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
+    return Combined;
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
+    return Combined;
   return SDValue();
 }
 
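Aside: the two commutative reassociations above, illustrated on plain integers (either the two constants fold together, or the constant is pushed outward so later combines can reach it). A minimal check, independent of LLVM:

#include <cassert>

int main() {
  int x = 7, y = 11, c1 = 3, c2 = 5;
  // (op (op x, c1), c2) -> (op x, (op c1, c2)): fold the two constants.
  assert(((x + c1) + c2) == (x + (c1 + c2)));
  // (op (op x, c1), y) -> (op (op x, y), c1): move the constant outward.
  assert(((x + c1) + y) == ((x + y) + c1));
  // The same holds for any commutative, associative op, e.g. mul:
  assert(((x * c1) * y) == ((x * y) * c1));
  return 0;
}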
@@ -1026,10 +1204,11 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
 
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
-bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+                                       const APInt &DemandedElts) {
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
   KnownBits Known;
-  if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO))
+  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
     return false;
 
   // Revisit the node.
@@ -1048,12 +1227,13 @@
 /// Check the specified vector node value to see if it can be simplified or
 /// if things it uses can be simplified as it only uses some of the elements.
 /// If so, return true.
-bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
+bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
+                                             const APInt &DemandedElts,
                                              bool AssumeSingleUse) {
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
   APInt KnownUndef, KnownZero;
-  if (!TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO,
-                                      0, AssumeSingleUse))
+  if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
+                                      TLO, 0, AssumeSingleUse))
     return false;
 
   // Revisit the node.
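Aside: the demanded-bits idea in a nutshell, as a standalone illustration (not LLVM code): when a user only demands some bits of a value, operations that touch only the other bits are dead and can be simplified away.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xDEADBEEF;
  // The final "& 0xFF" demands only the low 8 bits, so "| 0x100" (which
  // touches only bit 8) is dead: the expression simplifies to x & 0xFF.
  uint32_t before = (x | 0x100) & 0xFF;
  uint32_t after = x & 0xFF;
  assert(before == after);
  return 0;
}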
@@ -1383,6 +1563,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
   LegalOperations = Level >= AfterLegalizeVectorOps;
   LegalTypes = Level >= AfterLegalizeTypes;
 
+  WorklistInserter AddNodes(*this);
+
   // Add all the dag nodes to the worklist.
   for (SDNode &Node : DAG.allnodes())
     AddToWorklist(&Node);
@@ -1392,19 +1574,8 @@
   // changes of the root.
   HandleSDNode Dummy(DAG.getRoot());
 
-  // While the worklist isn't empty, find a node and try to combine it.
-  while (!WorklistMap.empty()) {
-    SDNode *N;
-    // The Worklist holds the SDNodes in order, but it may contain null entries.
-    do {
-      N = Worklist.pop_back_val();
-    } while (!N);
-
-    bool GoodWorklistEntry = WorklistMap.erase(N);
-    (void)GoodWorklistEntry;
-    assert(GoodWorklistEntry &&
-           "Found a worklist entry without a corresponding map entry!");
-
+  // While we have a valid worklist entry node, try to combine it.
+  while (SDNode *N = getNextWorklistEntry()) {
     // If N has no uses, it is dead. Make sure to revisit all N's operands once
     // N is deleted from the DAG, since they too may now be dead or may have a
     // reduced number of uses, allowing other xforms.
@@ -1493,9 +1664,11 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SSUBSAT:
   case ISD::USUBSAT:            return visitSUBSAT(N);
   case ISD::ADDC:               return visitADDC(N);
-  case ISD::UADDO:              return visitUADDO(N);
+  case ISD::SADDO:
+  case ISD::UADDO:              return visitADDO(N);
   case ISD::SUBC:               return visitSUBC(N);
-  case ISD::USUBO:              return visitUSUBO(N);
+  case ISD::SSUBO:
+  case ISD::USUBO:              return visitSUBO(N);
   case ISD::ADDE:               return visitADDE(N);
   case ISD::ADDCARRY:           return visitADDCARRY(N);
   case ISD::SUBE:               return visitSUBE(N);
@@ -1509,8 +1682,8 @@
   case ISD::MULHS:              return visitMULHS(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
-  case ISD::SMULO:              return visitSMULO(N);
-  case ISD::UMULO:              return visitUMULO(N);
+  case ISD::SMULO:
+  case ISD::UMULO:              return visitMULO(N);
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
@@ -1590,8 +1763,22 @@
   case ISD::MLOAD:              return visitMLOAD(N);
   case ISD::MSCATTER:           return visitMSCATTER(N);
   case ISD::MSTORE:             return visitMSTORE(N);
+  case ISD::LIFETIME_END:       return visitLIFETIME_END(N);
   case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
   case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N);
+  case ISD::VECREDUCE_FADD:
+  case ISD::VECREDUCE_FMUL:
+  case ISD::VECREDUCE_ADD:
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_XOR:
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_UMIN:
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:     return visitVECREDUCE(N);
   }
   return SDValue();
 }
@@ -1644,7 +1831,7 @@ SDValue DAGCombiner::combine(SDNode *N) {
     }
   }
 
-  // If N is a commutative binary node, try eliminate it if the commuted
+  // If N is a commutative binary node, try to eliminate it if the commuted
   // version is already present in the DAG.
   if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
       N->getNumValues() == 1) {
@@ -1693,6 +1880,12 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
   if (OptLevel == CodeGenOpt::None)
     return SDValue();
 
+  // If the sole user is a token factor, we should make sure we have a
+  // chance to merge them together. This prevents TF chains from inhibiting
+  // optimizations.
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
+    AddToWorklist(*(N->use_begin()));
+
   SmallVector<SDNode *, 8> TFs;     // List of token factors to visit.
   SmallVector<SDValue, 8> Ops;      // Ops for replacing token factor.
   SmallPtrSet<SDNode*, 16> SeenOps;
@@ -1704,8 +1897,19 @@
   // Iterate through token factors. The TFs list grows as new token factors
   // are encountered.
   for (unsigned i = 0; i < TFs.size(); ++i) {
-    SDNode *TF = TFs[i];
+    // Limit number of nodes to inline, to avoid quadratic compile times.
+    // We have to add the outstanding Token Factors to Ops, otherwise we might
+    // drop Ops from the resulting Token Factors.
+    if (Ops.size() > TokenFactorInlineLimit) {
+      for (unsigned j = i; j < TFs.size(); j++)
+        Ops.emplace_back(TFs[j], 0);
+      // Drop unprocessed Token Factors from TFs, so we do not add them to the
+      // combiner worklist later.
+      TFs.resize(i);
+      break;
+    }
 
+    SDNode *TF = TFs[i];
     // Check each of the operands.
     for (const SDValue &Op : TF->op_values()) {
       switch (Op.getOpcode()) {
@@ -1719,8 +1923,6 @@
         if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
           // Queue up for processing.
           TFs.push_back(Op.getNode());
-          // Clean up in case the token factor is removed.
-          AddToWorklist(Op.getNode());
           Changed = true;
           break;
         }
@@ -1737,6 +1939,11 @@
     }
   }
 
+  // Re-visit inlined Token Factors, to clean them up in case they have been
+  // removed. Skip the first Token Factor, as this is the current node.
+  for (unsigned i = 1, e = TFs.size(); i < e; i++)
+    AddToWorklist(TFs[i]);
+
   // Remove Nodes that are chained to another node in the list. Do so
   // by walking up chains breadth-first, stopping when we've seen
   // another operand. In general we must climb to the EntryNode, but we can exit
@@ -1803,6 +2010,8 @@
       for (const SDValue &Op : CurNode->op_values())
         AddToWorklist(i, Op.getNode(), CurOpNumber);
       break;
+    case ISD::LIFETIME_START:
+    case ISD::LIFETIME_END:
     case ISD::CopyFromReg:
     case ISD::CopyToReg:
       AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
       break;
@@ -1831,9 +2040,9 @@
       if (SeenChains.count(Op.getNode()) == 0)
         PrunedOps.push_back(Op);
     }
-    Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
+    Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
   } else {
-    Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+    Result = DAG.getTokenFactor(SDLoc(N), Ops);
   }
   return Result;
 }
@@ -1869,7 +2078,8 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
 }
 
 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
-  assert(ISD::isBinaryOp(BO) && "Unexpected binary operator");
+  assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
+         "Unexpected binary operator");
 
   // Don't do this unless the old select is going away. We want to eliminate the
   // binary operator, not replace a binop with a select.
@@ -1940,7 +2150,9 @@
       !isConstantFPBuildVectorOrConstantFP(NewCF))
     return SDValue();
 
-  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+  SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+  SelectOp->setFlags(BO->getFlags());
+  return SelectOp;
 }
 
 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
@@ -1990,6 +2202,7 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
   // We need a constant operand for the add/sub, and the other operand is a
   // logical shift right: add (srl), C or sub C, (srl).
+  // TODO - support non-uniform vector amounts.
   bool IsAdd = N->getOpcode() == ISD::ADD;
   SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
   SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
@@ -2006,7 +2219,7 @@
   EVT VT = ShiftOp.getValueType();
   SDValue ShAmt = ShiftOp.getOperand(1);
   ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
-  if (!ShAmtC || ShAmtC->getZExtValue() != VT.getScalarSizeInBits() - 1)
+  if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
     return SDValue();
 
   // Eliminate the 'not' by adjusting the shift and add/sub constant:
@@ -2019,7 +2232,10 @@
   return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT));
 }
 
-SDValue DAGCombiner::visitADD(SDNode *N) {
+/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
+/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
+/// are no common bits set in the operands).
+SDValue DAGCombiner::visitADDLike(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
@@ -2058,13 +2274,22 @@
     return N0;
 
   if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
+    // fold ((A-c1)+c2) -> (A+(c2-c1))
+    if (N0.getOpcode() == ISD::SUB &&
+        isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
+      SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N1.getNode(),
+                                               N0.getOperand(1).getNode());
+      assert(Sub && "Constant folding failed");
+      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
+    }
+
     // fold ((c1-A)+c2) -> (c1+c2)-A
     if (N0.getOpcode() == ISD::SUB &&
         isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
-      // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic.
-      return DAG.getNode(ISD::SUB, DL, VT,
-                         DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
-                         N0.getOperand(1));
+      SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N1.getNode(),
+                                               N0.getOperand(0).getNode());
+      assert(Add && "Constant folding failed");
+      return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
     }
 
     // add (sext i1 X), 1 -> zext (not i1 X)
@@ -2097,9 +2322,10 @@
     return NewSel;
 
   // reassociate add
-  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
-    return RADD;
-
+  if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) {
+    if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
+      return RADD;
+  }
   // fold ((0-A) + B) -> B-A
   if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
@@ -2116,6 +2342,18 @@
   if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
     return N0.getOperand(0);
 
+  // fold ((A-B)+(C-A)) -> (C-B)
+  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
+      N0.getOperand(0) == N1.getOperand(1))
+    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), N0.getOperand(1));
+
+  // fold ((A-B)+(B-C)) -> (A-C)
+  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
+      N0.getOperand(1) == N1.getOperand(0))
+    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1.getOperand(1));
+
   // fold (A+(B-(A+C))) to (B-C)
   if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
       N0 == N1.getOperand(1).getOperand(0))
@@ -2148,31 +2386,93 @@
         DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
   }
 
+  // fold (add (umax X, C), -C) --> (usubsat X, C)
+  if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
+    auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
+      return (!Max && !Op) ||
+             (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
+    };
+    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
+                                  /*AllowUndefs*/ true))
+      return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
+                         N0.getOperand(1));
+  }
+
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  if (isOneOrOneSplat(N1)) {
+    // fold (add (xor a, -1), 1) -> (sub 0, a)
+    if (isBitwiseNot(N0))
+      return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+                         N0.getOperand(0));
+
+    // fold (add (add (xor a, -1), b), 1) -> (sub b, a)
+    if (N0.getOpcode() == ISD::ADD ||
+        N0.getOpcode() == ISD::UADDO ||
+        N0.getOpcode() == ISD::SADDO) {
+      SDValue A, Xor;
+
+      if (isBitwiseNot(N0.getOperand(0))) {
+        A = N0.getOperand(1);
+        Xor = N0.getOperand(0);
+      } else if (isBitwiseNot(N0.getOperand(1))) {
+        A = N0.getOperand(0);
+        Xor = N0.getOperand(1);
+      }
+
+      if (Xor)
+        return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
+    }
+
+    // Look for:
+    //   add (add x, y), 1
+    // And if the target does not like this form then turn into:
+    //   sub y, (xor x, -1)
+    if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
+        N0.getOpcode() == ISD::ADD) {
+      SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
+                                DAG.getAllOnesConstant(DL, VT));
+      return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
+    }
+  }
+
+  // (x - y) + -1  ->  add (xor y, -1), x
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
+      isAllOnesOrAllOnesSplat(N1)) {
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
+    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
+  }
+
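Aside: the new visitADDLike folds above rest on two's-complement identities that can be checked exhaustively on a narrow type; a standalone verification (plain C++, not part of this patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned i = 0; i < 256; ++i) {
    for (unsigned j = 0; j < 256; ++j) {
      uint8_t a = i, b = j;
      // (add (xor a, -1), 1) -> (sub 0, a): ~a + 1 == -a
      assert((uint8_t)((a ^ 0xFF) + 1) == (uint8_t)(0 - a));
      // (add (add (xor a, -1), b), 1) -> (sub b, a)
      assert((uint8_t)(((a ^ 0xFF) + b) + 1) == (uint8_t)(b - a));
      // (x - y) + -1 -> (add (xor y, -1), x)
      assert((uint8_t)((a - b) - 1) == (uint8_t)((b ^ 0xFF) + a));
    }
  }
  return 0;
}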
+  if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
+    return Combined;
+
+  if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
+    return Combined;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  SDLoc DL(N);
+
+  if (SDValue Combined = visitADDLike(N))
+    return Combined;
+
   if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
     return V;
 
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
-  if (SimplifyDemandedBits(SDValue(N, 0)))
-    return SDValue(N, 0);
-
   // fold (a+b) -> (a|b) iff a and b share no bits.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
       DAG.haveNoCommonBitsSet(N0, N1))
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
-  // fold (add (xor a, -1), 1) -> (sub 0, a)
-  if (isBitwiseNot(N0) && isOneOrOneSplat(N1))
-    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                       N0.getOperand(0));
-
-  if (SDValue Combined = visitADDLike(N0, N1, N))
-    return Combined;
-
-  if (SDValue Combined = visitADDLike(N1, N0, N))
-    return Combined;
-
   return SDValue();
 }
 
@@ -2246,6 +2546,10 @@ static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
       V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
     return SDValue();
 
+  EVT VT = V.getNode()->getValueType(0);
+  if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
+    return SDValue();
+
   // If the result is masked, then no matter what kind of bool it is we can
   // return. If it isn't, then we need to make sure the bool type is either 0 or
   // 1 and not other values.
@@ -2257,7 +2561,26 @@
     return SDValue();
 }
 
-SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+/// Given the operands of an add/sub operation, see if the 2nd operand is a
+/// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
+/// the opcode and bypass the mask operation.
+static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
+                                 SelectionDAG &DAG, const SDLoc &DL) {
+  if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
+    return SDValue();
+
+  EVT VT = N0.getValueType();
+  if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
+    return SDValue();
+
+  // add N0, (and (AssertSext X, i1), 1) --> sub N0, X
+  // sub N0, (and (AssertSext X, i1), 1) --> add N0, X
+  return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
+}
+
+/// Helper for doing combines based on N0 and N1 being added to each other.
+SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
+                                             SDNode *LocReference) {
   EVT VT = N0.getValueType();
   SDLoc DL(LocReference);
 
@@ -2269,21 +2592,42 @@
                                    N1.getOperand(0).getOperand(1),
                                    N1.getOperand(1)));
 
-  if (N1.getOpcode() == ISD::AND) {
-    SDValue AndOp0 = N1.getOperand(0);
-    unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0);
-    unsigned DestBits = VT.getScalarSizeInBits();
-
-    // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
-    // and similar xforms where the inner op is either ~0 or 0.
-    if (NumSignBits == DestBits && isOneOrOneSplat(N1->getOperand(1)))
-      return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
-  }
+  if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
+    return V;
 
-  // add (sext i1), X -> sub X, (zext i1)
+  // Look for:
+  //   add (add x, 1), y
+  // And if the target does not like this form then turn into:
+  //   sub y, (xor x, -1)
+  if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) {
+    SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
+                              DAG.getAllOnesConstant(DL, VT));
+    return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
+  }
+
+  // Hoist one-use subtraction by non-opaque constant:
+  //   (x - C) + y  ->  (x + y) - C
+  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
+    return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
+  }
+  // Hoist one-use subtraction from non-opaque constant:
+  //   (C - x) + y  ->  (y - x) + C
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
+    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
+  }
+
+  // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
+  // rather than 'add 0/-1' (the zext should get folded).
+  // add (sext i1 Y), X --> sub X, (zext i1 Y)
   if (N0.getOpcode() == ISD::SIGN_EXTEND &&
-      N0.getOperand(0).getValueType() == MVT::i1 &&
-      !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
+      N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
+      TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
     return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
   }
@@ -2344,8 +2688,10 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
   return SDValue();
 }
 
-static SDValue flipBoolean(SDValue V, const SDLoc &DL, EVT VT,
+static SDValue flipBoolean(SDValue V, const SDLoc &DL,
                            SelectionDAG &DAG, const TargetLowering &TLI) {
+  EVT VT = V.getValueType();
+
   SDValue Cst;
   switch (TLI.getBooleanContents(VT)) {
   case TargetLowering::ZeroOrOneBooleanContent:
@@ -2353,35 +2699,60 @@
     Cst = DAG.getConstant(1, DL, VT);
     break;
   case TargetLowering::ZeroOrNegativeOneBooleanContent:
-    Cst = DAG.getConstant(-1, DL, VT);
+    Cst = DAG.getAllOnesConstant(DL, VT);
     break;
   }
 
   return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
 }
 
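Aside: flipBoolean's case analysis in miniature. For 0/1 booleans an XOR with 1 flips the value; for 0/-1 booleans an XOR with all-ones does. A standalone sketch (not LLVM code), with an enum standing in for TargetLowering::BooleanContent:

#include <cassert>
#include <cstdint>

enum BooleanContent { ZeroOrOne, ZeroOrNegativeOne };

// Flip a target boolean by XOR-ing with the representation of "true".
int32_t flipBoolean(int32_t V, BooleanContent BC) {
  return V ^ (BC == ZeroOrOne ? 1 : -1);
}

int main() {
  assert(flipBoolean(0, ZeroOrOne) == 1);
  assert(flipBoolean(1, ZeroOrOne) == 0);
  assert(flipBoolean(0, ZeroOrNegativeOne) == -1);
  assert(flipBoolean(-1, ZeroOrNegativeOne) == 0);
  return 0;
}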
-static bool isBooleanFlip(SDValue V, EVT VT, const TargetLowering &TLI) {
-  if (V.getOpcode() != ISD::XOR) return false;
-  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V.getOperand(1));
-  if (!Const) return false;
+/**
+ * Flips a boolean if it is cheaper to compute. If the Force parameter is set,
+ * then the flip also occurs if computing the inverse is the same cost.
+ * This function returns an empty SDValue in case it cannot flip the boolean
+ * without increasing the cost of the computation. If you want to flip a
+ * boolean no matter what, use flipBoolean.
+ */
+static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
+                                  const TargetLowering &TLI,
+                                  bool Force) {
+  if (Force && isa<ConstantSDNode>(V))
+    return flipBoolean(V, SDLoc(V), DAG, TLI);
+
+  if (V.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
+  if (!Const)
+    return SDValue();
 
+  EVT VT = V.getValueType();
+
+  bool IsFlip = false;
   switch(TLI.getBooleanContents(VT)) {
     case TargetLowering::ZeroOrOneBooleanContent:
-      return Const->isOne();
+      IsFlip = Const->isOne();
+      break;
     case TargetLowering::ZeroOrNegativeOneBooleanContent:
-      return Const->isAllOnesValue();
+      IsFlip = Const->isAllOnesValue();
+      break;
     case TargetLowering::UndefinedBooleanContent:
-      return (Const->getAPIntValue() & 0x01) == 1;
+      IsFlip = (Const->getAPIntValue() & 0x01) == 1;
+      break;
   }
-  llvm_unreachable("Unsupported boolean content");
+
+  if (IsFlip)
+    return V.getOperand(0);
+  if (Force)
+    return flipBoolean(V, SDLoc(V), DAG, TLI);
+  return SDValue();
 }
 
-SDValue DAGCombiner::visitUADDO(SDNode *N) {
+SDValue DAGCombiner::visitADDO(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return SDValue();
+  bool IsSigned = (ISD::SADDO == N->getOpcode());
 
   EVT CarryVT = N->getValueType(1);
   SDLoc DL(N);
@@ -2392,40 +2763,42 @@
                      DAG.getUNDEF(CarryVT));
 
   // canonicalize constant to RHS.
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  if (N0C && !N1C)
-    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
 
-  // fold (uaddo x, 0) -> x + no carry out
-  if (isNullConstant(N1))
+  // fold (addo x, 0) -> x + no carry out
+  if (isNullOrNullSplat(N1))
     return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
 
-  // If it cannot overflow, transform into an add.
-  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
-    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
-                     DAG.getConstant(0, DL, CarryVT));
+  if (!IsSigned) {
+    // If it cannot overflow, transform into an add.
+    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+      return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                       DAG.getConstant(0, DL, CarryVT));
 
-  // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
+    if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
+      SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
+                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
+      return CombineTo(N, Sub,
+                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
+    }
 
-  if (SDValue Combined = visitUADDOLike(N0, N1, N))
-    return Combined;
+    if (SDValue Combined = visitUADDOLike(N0, N1, N))
+      return Combined;
 
-  if (SDValue Combined = visitUADDOLike(N1, N0, N))
-    return Combined;
+    if (SDValue Combined = visitUADDOLike(N1, N0, N))
+      return Combined;
+  }
 
   return SDValue();
 }
 
 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
-  auto VT = N0.getValueType();
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
 
   // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
   // If Y + 1 cannot overflow.
@@ -2484,11 +2857,10 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
     return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
   }
 
-  EVT CarryVT = CarryIn.getValueType();
-
   // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
   if (isNullConstant(N0) && isNullConstant(N1)) {
     EVT VT = N0.getValueType();
+    EVT CarryVT = CarryIn.getValueType();
     SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
     AddToWorklist(CarryExt.getNode());
     return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
                                     DAG.getConstant(1, DL, VT)),
                      DAG.getConstant(0, DL, CarryVT));
   }
 
-  // fold (addcarry (xor a, -1), 0, !b) -> (subcarry 0, a, b) and flip carry.
-  if (isBitwiseNot(N0) && isNullConstant(N1) &&
-      isBooleanFlip(CarryIn, CarryVT, TLI)) {
-    SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(),
-                              DAG.getConstant(0, DL, N0.getValueType()),
-                              N0.getOperand(0), CarryIn.getOperand(0));
-    return CombineTo(N, Sub,
-                     flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI));
-  }
-
   if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
     return Combined;
 
@@ -2515,12 +2877,112 @@
   return SDValue();
 }
 
+/**
+ * If we are facing some sort of diamond carry propagation pattern try to
+ * break it up to generate something like:
+ *   (addcarry X, 0, (addcarry A, B, Z):Carry)
+ *
+ * The end result is usually an increase in operations required, but because
+ * the carry is now linearized, other transforms can kick in and optimize the
+ * DAG.
+ *
+ * Patterns typically look something like
+ *            (uaddo A, B)
+ *             /       \
+ *          Carry      Sum
+ *            |          \
+ *            | (addcarry *, 0, Z)
+ *            |       /
+ *             \   Carry
+ *              |   /
+ *   (addcarry X, *, *)
+ *
+ * But numerous variations exist. Our goal is to identify A, B, X and Z and
+ * produce a combine with a single path for carry propagation.
+ */
+static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
+                                      SDValue X, SDValue Carry0, SDValue Carry1,
+                                      SDNode *N) {
+  if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
+    return SDValue();
+
+  if (Carry1.getOpcode() != ISD::UADDO)
+    return SDValue();
+
+  SDValue Z;
+
+  /**
+   * First look for a suitable Z. It will present itself in the form of
+   * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
+   */
+  if (Carry0.getOpcode() == ISD::ADDCARRY &&
+      isNullConstant(Carry0.getOperand(1))) {
+    Z = Carry0.getOperand(2);
+  } else if (Carry0.getOpcode() == ISD::UADDO &&
+             isOneConstant(Carry0.getOperand(1))) {
+    EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
+    Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
+  } else {
+    // We couldn't find a suitable Z.
+    return SDValue();
+  }
+
+  auto cancelDiamond = [&](SDValue A, SDValue B) {
+    SDLoc DL(N);
+    SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
+    Combiner.AddToWorklist(NewY.getNode());
+    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
+                       DAG.getConstant(0, DL, X.getValueType()),
+                       NewY.getValue(1));
+  };
+
+  /**
+   *         (uaddo A, B)
+   *              |
+   *             Sum
+   *              |
+   *      (addcarry *, 0, Z)
+   */
+  if (Carry0.getOperand(0) == Carry1.getValue(0)) {
+    return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
+  }
+
+  /**
+   *     (addcarry A, 0, Z)
+   *              |
+   *             Sum
+   *              |
+   *       (uaddo *, B)
+   */
+  if (Carry1.getOperand(0) == Carry0.getValue(0)) {
+    return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
+  }
+
+  if (Carry1.getOperand(1) == Carry0.getValue(0)) {
+    return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
+  }
+
+  return SDValue();
+}
+
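Aside: a standalone check (plain C++, not part of this patch) that the diamond rewrite preserves results for one of its shapes; uaddo/addcarry are modeled as 64-bit adds with an explicit carry bit.

#include <cassert>
#include <cstdint>

struct Res { uint64_t sum; uint64_t carry; };

// Models ISD::ADDCARRY: a + b + cin, with a carry-out bit.
Res addcarry(uint64_t a, uint64_t b, uint64_t cin) {
  uint64_t s1 = a + b;
  uint64_t c = s1 < a;
  uint64_t s2 = s1 + cin;
  c |= s2 < s1;
  return {s2, c};
}

int main() {
  uint64_t A = ~0ull, B = ~0ull, X = 123, Z = 1;
  // Diamond shape: the carry of (uaddo A, B) and the carry of
  // (addcarry (A+B), 0, Z) both reach the final addcarry of X.
  Res ab = addcarry(A, B, 0);                 // uaddo A, B
  Res abz = addcarry(ab.sum, 0, Z);           // addcarry Sum, 0, Z
  Res dia = addcarry(X, abz.carry, ab.carry); // final (addcarry X, *, *)
  // Linearized: (addcarry X, 0, (addcarry A, B, Z):Carry)
  Res lin0 = addcarry(A, B, Z);
  Res lin = addcarry(X, 0, lin0.carry);
  assert(dia.sum == lin.sum && dia.carry == lin.carry);
  return 0;
}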
 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                        SDNode *N) {
+  // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
+  if (isBitwiseNot(N0))
+    if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
+      SDLoc DL(N);
+      SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
+                                N0.getOperand(0), NotC);
+      return CombineTo(N, Sub,
+                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
+    }
+
   // Iff the flag result is dead:
   // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
+  // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
+  // or the dependency between the instructions.
   if ((N0.getOpcode() == ISD::ADD ||
-       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) &&
+       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
+        N0.getValue(1) != CarryIn)) &&
       isNullConstant(N1) && !N->hasAnyUseOfValue(1))
     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
                        N0.getOperand(0), N0.getOperand(1), CarryIn);
@@ -2529,35 +2991,13 @@
   /**
    * When one of the addcarry argument is itself a carry, we may be facing
    * a diamond carry propagation. In which case we try to transform the DAG
    * to ensure linear carry propagation if that is possible.
-   *
-   * We are trying to get:
-   *   (addcarry X, 0, (addcarry A, B, Z):Carry)
    */
   if (auto Y = getAsCarry(TLI, N1)) {
-    /**
-     *            (uaddo A, B)
-     *             /       \
-     *          Carry      Sum
-     *            |          \
-     *            | (addcarry *, 0, Z)
-     *            |       /
-     *             \   Carry
-     *              |   /
-     *   (addcarry X, *, *)
-     */
-    if (Y.getOpcode() == ISD::UADDO &&
-        CarryIn.getResNo() == 1 &&
-        CarryIn.getOpcode() == ISD::ADDCARRY &&
-        isNullConstant(CarryIn.getOperand(1)) &&
-        CarryIn.getOperand(0) == Y.getValue(0)) {
-      auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(),
-                              Y.getOperand(0), Y.getOperand(1),
-                              CarryIn.getOperand(2));
-      AddToWorklist(NewY.getNode());
-      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
-                         DAG.getConstant(0, SDLoc(N), N0.getValueType()),
-                         NewY.getValue(1));
-    }
+    // Because both are carries, Y and Z can be swapped.
+    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
+      return R;
+    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
+      return R;
   }
 
   return SDValue();
 }
@@ -2620,7 +3060,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   // -(X >>s 31) -> (X >>u 31)
   if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
     ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
-    if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) {
+    if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
       auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
       if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
         return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
@@ -2662,16 +3102,48 @@
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
     return N0.getOperand(0);
 
+  // fold (A+C1)-C2 -> A+(C1-C2)
+  if (N0.getOpcode() == ISD::ADD &&
+      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
+      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
+    SDValue NewC = DAG.FoldConstantArithmetic(
+        ISD::SUB, DL, VT, N0.getOperand(1).getNode(), N1.getNode());
+    assert(NewC && "Constant folding failed");
+    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
+  }
+
   // fold C2-(A+C1) -> (C2-C1)-A
   if (N1.getOpcode() == ISD::ADD) {
     SDValue N11 = N1.getOperand(1);
     if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
         isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
-      SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11);
+      SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
+                                                N11.getNode());
+      assert(NewC && "Constant folding failed");
       return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
     }
   }
 
+  // fold (A-C1)-C2 -> A-(C1+C2)
+  if (N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
+      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
+    SDValue NewC = DAG.FoldConstantArithmetic(
+        ISD::ADD, DL, VT, N0.getOperand(1).getNode(), N1.getNode());
+    assert(NewC && "Constant folding failed");
+    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
+  }
+
+  // fold (c1-A)-c2 -> (c1-c2)-A
+  if (N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
+      isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) {
+    SDValue NewC = DAG.FoldConstantArithmetic(
+        ISD::SUB, DL, VT, N0.getOperand(0).getNode(), N1.getNode());
+    assert(NewC && "Constant folding failed");
+    return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
+  }
+
   // fold ((A+(B+or-C))-B) -> A+or-C
   if (N0.getOpcode() == ISD::ADD &&
       (N0.getOperand(1).getOpcode() == ISD::SUB ||
@@ -2728,6 +3200,63 @@
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
+    return V;
+
+  // (x - y) - 1  ->  add (xor y, -1), x
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
+                              DAG.getAllOnesConstant(DL, VT));
+    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
+  }
+
+  // Look for:
+  //   sub y, (xor x, -1)
+  // And if the target does not like this form then turn into:
+  //   add (add x, y), 1
+  if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
+    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
+    return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
+  }
+
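Aside: the constant-motion folds added to visitSUB above all hold in wraparound arithmetic; an exhaustive check on uint8_t with arbitrary fixed constants (plain C++, not part of this patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t c1 = 0x21, c2 = 0x5B;
  for (unsigned i = 0; i < 256; ++i)
    for (unsigned j = 0; j < 256; ++j) {
      uint8_t a = i, y = j;
      assert((uint8_t)((a + c1) - c2) == (uint8_t)(a + (c1 - c2))); // (A+C1)-C2
      assert((uint8_t)(c2 - (a + c1)) == (uint8_t)((c2 - c1) - a)); // C2-(A+C1)
      assert((uint8_t)((a - c1) - c2) == (uint8_t)(a - (c1 + c2))); // (A-C1)-C2
      assert((uint8_t)((c1 - a) - c2) == (uint8_t)((c1 - c2) - a)); // (c1-A)-c2
      assert((uint8_t)((a - y) - 1) == (uint8_t)((y ^ 0xFF) + a));  // (x-y)-1
    }
  return 0;
}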
+  // Hoist one-use addition by non-opaque constant:
+  //   (x + C) - y  ->  (x - y) + C
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
+      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
+    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
+  }
+  // y - (x + C)  ->  (y - x) - C
+  if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD &&
+      isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
+    return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
+  }
+  // (x - C) - y  ->  (x - y) - C
+  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
+    return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
+  }
+  // (C - x) - y  ->  C - (x + y)
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
+      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
+    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
+    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
+  }
+
+  // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
+  // rather than 'sub 0/1' (the sext should get folded).
+  // sub X, (zext i1 Y) --> add X, (sext i1 Y)
+  if (N1.getOpcode() == ISD::ZERO_EXTEND &&
+      N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
+      TLI.getBooleanContents(VT) ==
+          TargetLowering::ZeroOrNegativeOneBooleanContent) {
+    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
+    return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
+  }
+
   // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
   if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
     if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
@@ -2772,7 +3301,8 @@
   if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
     SDValue ShAmt = N1.getOperand(1);
     ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
-    if (ShAmtC && ShAmtC->getZExtValue() == N1.getScalarValueSizeInBits() - 1) {
+    if (ShAmtC &&
+        ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
      SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
       return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
     }
@@ -2846,12 +3376,11 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
   return SDValue();
 }
 
-SDValue DAGCombiner::visitUSUBO(SDNode *N) {
+SDValue DAGCombiner::visitSUBO(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return SDValue();
+  bool IsSigned = (ISD::SSUBO == N->getOpcode());
 
   EVT CarryVT = N->getValueType(1);
   SDLoc DL(N);
@@ -2861,17 +3390,25 @@
     return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                      DAG.getUNDEF(CarryVT));
 
-  // fold (usubo x, x) -> 0 + no borrow
+  // fold (subo x, x) -> 0 + no borrow
   if (N0 == N1)
     return CombineTo(N, DAG.getConstant(0, DL, VT),
                      DAG.getConstant(0, DL, CarryVT));
 
-  // fold (usubo x, 0) -> x + no borrow
-  if (isNullConstant(N1))
+  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+
+  // fold (subo x, c) -> (addo x, -c)
+  if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
+    return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
+                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
+  }
DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); + } + + // fold (subo x, 0) -> x + no borrow + if (isNullOrNullSplat(N1)) return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow - if (isAllOnesConstant(N0)) + if (!IsSigned && isAllOnesOrAllOnesSplat(N0)) return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), DAG.getConstant(0, DL, CarryVT)); @@ -3012,13 +3549,13 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { MathOp = ISD::SUB; if (MathOp != ISD::DELETED_NODE) { - unsigned ShAmt = MathOp == ISD::ADD ? (MulC - 1).logBase2() - : (MulC + 1).logBase2(); - assert(ShAmt > 0 && ShAmt < VT.getScalarSizeInBits() && - "Not expecting multiply-by-constant that could have simplified"); + unsigned ShAmt = + MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2(); + assert(ShAmt < VT.getScalarSizeInBits() && + "multiply-by-constant generated out of bounds shift"); SDLoc DL(N); - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, - DAG.getConstant(ShAmt, DL, VT)); + SDValue Shl = + DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT)); SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0); if (ConstValue1.isNegative()) R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R); @@ -3069,7 +3606,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { N0.getOperand(1), N1)); // reassociate mul - if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags())) + if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags())) return RMUL; return SDValue(); @@ -3612,7 +4149,6 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c) if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) { - SDLoc DL(N); unsigned NumEltBits = VT.getScalarSizeInBits(); SDValue LogBase2 = BuildLogBase2(N1, DL); SDValue SRLAmt = DAG.getNode( @@ -3753,22 +4289,14 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSMULO(SDNode *N) { - // (smulo x, 2) -> (saddo x, x) - if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) - if (C2->getAPIntValue() == 2) - return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), - N->getOperand(0), N->getOperand(0)); - - return SDValue(); -} +SDValue DAGCombiner::visitMULO(SDNode *N) { + bool IsSigned = (ISD::SMULO == N->getOpcode()); -SDValue DAGCombiner::visitUMULO(SDNode *N) { - // (umulo x, 2) -> (uaddo x, x) - if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) + // (mulo x, 2) -> (addo x, x) + if (ConstantSDNode *C2 = isConstOrConstSplat(N->getOperand(1))) if (C2->getAPIntValue() == 2) - return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), - N->getOperand(0), N->getOperand(0)); + return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, SDLoc(N), + N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } @@ -4075,6 +4603,33 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, SDValue Zero = DAG.getConstant(0, DL, OpVT); return DAG.getSetCC(DL, VT, Or, Zero, CC1); } + + // Turn compare of constants whose difference is 1 bit into add+and+setcc. + // TODO - support non-uniform vector amounts. + if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) { + // Match a shared variable operand and 2 non-opaque constant operands. 
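--- Editorial aside (illustration only, not part of the patch) ---
The fold introduced just above rewrites a pair of integer compares against
constants that differ by a single bit into one add+and+setcc. A minimal
standalone C++ sketch of the underlying identity; the function name and the
sample constants are ours, not LLVM APIs:

#include <cassert>
#include <cstdint>

// (X != C0) && (X != C1) with C0 > C1 and C0 - C1 a power of two:
// subtracting C1 maps C1 to 0 and C0 to the lone set bit C0 - C1, so
// masking that bit away yields zero exactly for X in {C0, C1}.
static bool neitherEquals(uint32_t X, uint32_t C0, uint32_t C1) {
  uint32_t Diff = C0 - C1;
  assert(C0 > C1 && (Diff & (Diff - 1)) == 0 && "difference must be 2^k");
  return ((X - C1) & ~Diff) != 0;
}

int main() {
  for (uint32_t X = 0; X < 64; ++X)
    assert(neitherEquals(X, /*C0=*/5, /*C1=*/1) == (X != 5 && X != 1));
  return 0;
}
--- End aside ---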
+ ConstantSDNode *C0 = isConstOrConstSplat(LR); + ConstantSDNode *C1 = isConstOrConstSplat(RR); + if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) { + // Canonicalize larger constant as C0. + if (C1->getAPIntValue().ugt(C0->getAPIntValue())) + std::swap(C0, C1); + + // The difference of the constants must be a single bit. + const APInt &C0Val = C0->getAPIntValue(); + const APInt &C1Val = C1->getAPIntValue(); + if ((C0Val - C1Val).isPowerOf2()) { + // and/or (setcc X, C0, ne), (setcc X, C1, ne/eq) --> + // setcc ((add X, -C1), ~(C0 - C1)), 0, ne/eq + SDValue OffsetC = DAG.getConstant(-C1Val, DL, OpVT); + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LL, OffsetC); + SDValue MaskC = DAG.getConstant(~(C0Val - C1Val), DL, OpVT); + SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Add, MaskC); + SDValue Zero = DAG.getConstant(0, DL, OpVT); + return DAG.getSetCC(DL, VT, And, Zero, CC0); + } + } + } } // Canonicalize equivalent operands to LL == RL. @@ -4259,7 +4814,8 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, // Ensure that this isn't going to produce an unsupported unaligned access. if (ShAmt && !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - LDST->getAddressSpace(), ShAmt / 8)) + LDST->getAddressSpace(), ShAmt / 8, + LDST->getMemOperand()->getFlags())) return false; // It's not possible to generate a constant of extended or untyped type. @@ -4316,9 +4872,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, SDNode *&NodeToMask) { // Recursively search for the operands, looking for loads which can be // narrowed. - for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) { - SDValue Op = N->getOperand(i); - + for (SDValue Op : N->op_values()) { if (Op.getValueType().isVector()) return false; @@ -4480,7 +5034,7 @@ SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) { SDValue N1 = N->getOperand(1); // Do we actually prefer shifts over mask? - if (!TLI.preferShiftsToClearExtremeBits(N0)) + if (!TLI.shouldFoldMaskToVariableShiftPair(N0)) return SDValue(); // Try to match (-1 '[outer] logical shift' y) @@ -4575,7 +5129,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return NewSel; // reassociate and - if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags())) + if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags())) return RAND; // Try to convert a constant mask AND into a shuffle clear mask. @@ -4644,24 +5198,22 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // the first vector value and FF for the rest, repeating. We need a mask // that will apply equally to all members of the vector, so AND all the // lanes of the constant together. - EVT VT = Vector->getValueType(0); - unsigned BitWidth = VT.getScalarSizeInBits(); + unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits(); // If the splat value has been compressed to a bitlength lower // than the size of the vector lane, we need to re-expand it to // the lane size. - if (BitWidth > SplatBitSize) - for (SplatValue = SplatValue.zextOrTrunc(BitWidth); - SplatBitSize < BitWidth; - SplatBitSize = SplatBitSize * 2) + if (EltBitWidth > SplatBitSize) + for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth); + SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2) SplatValue |= SplatValue.shl(SplatBitSize); // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. 
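--- Editorial aside (illustration only, not part of the patch) ---
The splat handling above re-expands a splat constant that was compressed
below the lane width, and the code just below conservatively ANDs the
lane-sized pieces of a wider splat into one mask. A small self-contained
C++ sketch of both steps, using made-up values:

#include <cassert>
#include <cstdint>

int main() {
  // Re-expand an 8-bit splat to a 32-bit lane by shift-or doubling.
  uint32_t Splat = 0xAB;
  for (unsigned W = 8; W < 32; W *= 2)
    Splat |= Splat << W;
  assert(Splat == 0xABABABABu);

  // For a 64-bit splat over 16-bit lanes, AND the four pieces together to
  // get one mask that is safe for every lane.
  uint64_t WideSplat = 0xFFFF00FFFFFF00FFull;
  uint16_t Mask = 0xFFFF;
  for (unsigned i = 0; i < 64; i += 16)
    Mask &= (uint16_t)(WideSplat >> i);
  assert(Mask == 0x00FF);
  return 0;
}
--- End aside ---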
- if (SplatBitSize % BitWidth == 0) { - Constant = APInt::getAllOnesValue(BitWidth); - for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i) - Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth); + if ((SplatBitSize % EltBitWidth) == 0) { + Constant = APInt::getAllOnesValue(EltBitWidth); + for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i) + Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth); } } } @@ -4773,44 +5325,29 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) - if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) { - LoadSDNode *LN0 = cast<LoadSDNode>(N0); - EVT MemVT = LN0->getMemoryVT(); - // If we zero all the possible extended bits, then we can turn this into - // a zextload if we are running before legalize or the operation is legal. - unsigned BitWidth = N1.getScalarValueSizeInBits(); - if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, - BitWidth - MemVT.getScalarSizeInBits())) && - ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, - LN0->getChain(), LN0->getBasePtr(), - MemVT, LN0->getMemOperand()); - AddToWorklist(N); - CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! - } - } // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use - if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - N0.hasOneUse()) { + if (ISD::isUNINDEXEDLoad(N0.getNode()) && + (ISD::isEXTLoad(N0.getNode()) || + (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. - unsigned BitWidth = N1.getScalarValueSizeInBits(); - if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, - BitWidth - MemVT.getScalarSizeInBits())) && + unsigned ExtBitSize = N1.getScalarValueSizeInBits(); + unsigned MemBitSize = MemVT.getScalarSizeInBits(); + APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); + if (DAG.MaskedValueIsZero(N1, ExtBits) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, - LN0->getChain(), LN0->getBasePtr(), - MemVT, LN0->getMemOperand()); + SDValue ExtLoad = + DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), + LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } + // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), @@ -5155,6 +5692,23 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { return SDValue(); } +/// OR combines for which the commuted variant will be tried as well. 
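--- Editorial aside (illustration only, not part of the patch) ---
The helper defined just below relies on the identity (x & ~y) | y == x | y:
every bit the mask ~y clears from x is re-supplied by the OR with y. A quick
exhaustive check over small values in plain C++:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 16; ++x)
    for (uint32_t y = 0; y < 16; ++y)
      assert(((x & ~y) | y) == (x | y));
  return 0;
}
--- End aside ---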
+static SDValue visitORCommutative(
+    SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
+  EVT VT = N0.getValueType();
+  if (N0.getOpcode() == ISD::AND) {
+    // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
+    if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1)
+      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);
+
+    // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
+    if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1)
+      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -5284,7 +5838,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return BSwap;
 
   // reassociate or
-  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
     return ROR;
 
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
@@ -5302,6 +5856,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     }
   }
 
+  if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
+    return Combined;
+  if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
+    return Combined;
+
   // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
   if (N0.getOpcode() == N1.getOpcode())
     if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
@@ -5318,6 +5877,12 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  // If OR can be rewritten into ADD, try combines based on ADD.
+  if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
+      DAG.haveNoCommonBitsSet(N0, N1))
+    if (SDValue Combined = visitADDLike(N))
+      return Combined;
+
   return SDValue();
 }
 
@@ -5869,6 +6434,213 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
   return None;
 }
 
+static unsigned LittleEndianByteAt(unsigned BW, unsigned i) {
+  return i;
+}
+
+static unsigned BigEndianByteAt(unsigned BW, unsigned i) {
+  return BW - i - 1;
+}
+
+// Check if the byte offsets we are looking at match either a big or little
+// endian value load. Return true for big endian, false for little endian,
+// and None if the match failed.
+static Optional<bool> isBigEndian(const SmallVector<int64_t, 4> &ByteOffsets,
+                                  int64_t FirstOffset) {
+  // The endianness can only be decided when the value is at least 2 bytes
+  // wide.
+  unsigned Width = ByteOffsets.size();
+  if (Width < 2)
+    return None;
+
+  bool BigEndian = true, LittleEndian = true;
+  for (unsigned i = 0; i < Width; i++) {
+    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i);
+    BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i);
+    if (!BigEndian && !LittleEndian)
+      return None;
+  }
+
+  assert((BigEndian != LittleEndian) && "It should be either big endian or "
+                                        "little endian");
+  return BigEndian;
+}
+
+static SDValue stripTruncAndExt(SDValue Value) {
+  switch (Value.getOpcode()) {
+  case ISD::TRUNCATE:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND:
+    return stripTruncAndExt(Value.getOperand(0));
+  }
+  return Value;
+}
+
+/// Match a pattern where a wide type scalar value is stored by several narrow
+/// stores. Fold it into a single store or a BSWAP and a store if the target
+/// supports it.
+///
+/// Assuming little endian target:
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 0) & 0xFF;
+/// p[1] = (val >> 8) & 0xFF;
+/// p[2] = (val >> 16) & 0xFF;
+/// p[3] = (val >> 24) & 0xFF;
+/// =>
+/// *((i32)p) = val;
+///
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 24) & 0xFF;
+/// p[1] = (val >> 16) & 0xFF;
+/// p[2] = (val >> 8) & 0xFF;
+/// p[3] = (val >> 0) & 0xFF;
+/// =>
+/// *((i32)p) = BSWAP(val);
+SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
+  // Collect all the stores in the chain.
+  SDValue Chain;
+  SmallVector<StoreSDNode *, 8> Stores;
+  for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
+    if (Store->getMemoryVT() != MVT::i8 ||
+        Store->isVolatile() || Store->isIndexed())
+      return SDValue();
+    Stores.push_back(Store);
+    Chain = Store->getChain();
+  }
+  // Handle simple types only.
+  unsigned Width = Stores.size();
+  EVT VT = EVT::getIntegerVT(
+    *DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
+    return SDValue();
+
+  // Check if all the bytes of the combined value we are looking at are stored
+  // to the same base address. Collect byte offsets from the Base address into
+  // ByteOffsets.
+  SDValue CombinedValue;
+  SmallVector<int64_t, 4> ByteOffsets(Width, INT64_MAX);
+  int64_t FirstOffset = INT64_MAX;
+  StoreSDNode *FirstStore = nullptr;
+  Optional<BaseIndexOffset> Base;
+  for (auto Store : Stores) {
+    // All the stores store a different byte of the CombinedValue. A truncate
+    // is required to get that byte value.
+    SDValue Trunc = Store->getValue();
+    if (Trunc.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+    // A shift operation is required to get the right byte offset, except for
+    // the first byte.
+    int64_t Offset = 0;
+    SDValue Value = Trunc.getOperand(0);
+    if (Value.getOpcode() == ISD::SRL ||
+        Value.getOpcode() == ISD::SRA) {
+      ConstantSDNode *ShiftOffset =
+        dyn_cast<ConstantSDNode>(Value.getOperand(1));
+      // Trying to match the following pattern. The shift offset must be
+      // a constant and a multiple of 8. It is the byte offset in "y".
+      //
+      // x = srl y, offset
+      // i8 z = trunc x
+      // store z, ...
+      if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
+        return SDValue();
+
+      Offset = ShiftOffset->getSExtValue()/8;
+      Value = Value.getOperand(0);
+    }
+
+    // Stores must share the same combined value with different offsets.
+    if (!CombinedValue)
+      CombinedValue = Value;
+    else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
+      return SDValue();
+
+    // The trunc and all the extend operations should be stripped to get the
+    // real value being stored.
+    else if (CombinedValue.getValueType() != VT) {
+      if (Value.getValueType() == VT ||
+          Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
+        CombinedValue = Value;
+      // Give up if the combined value type is smaller than the store size.
+      if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
+        return SDValue();
+    }
+
+    // Stores must share the same base address.
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
+    int64_t ByteOffsetFromBase = 0;
+    if (!Base)
+      Base = Ptr;
+    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
+      return SDValue();
+
+    // Remember the first byte store.
+    if (ByteOffsetFromBase < FirstOffset) {
+      FirstStore = Store;
+      FirstOffset = ByteOffsetFromBase;
+    }
+    // Map the offset in the store to the offset in the combined value, and
+    // return early if it has been set before.
+    if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
+      return SDValue();
+    ByteOffsets[Offset] = ByteOffsetFromBase;
+  }
+
+  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+  assert(FirstStore && "First store must be set");
+
+  // Check if the bytes of the combined value we are looking at match either a
+  // big or little endian value store.
+  Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
+  if (!IsBigEndian.hasValue())
+    return SDValue();
+
+  // The node we are looking at matches the pattern; check if we can replace
+  // it with a single BSWAP (if needed) and a store.
+
+  // If the store needs a byte swap, check if the target supports it.
+  bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;
+
+  // Before legalize we can introduce illegal bswaps which will be later
+  // converted to an explicit bswap sequence. This way we end up with a single
+  // store and byte shuffling instead of several stores and byte shuffling.
+  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  // Check that a store of the wide type is both allowed and fast on the
+  // target.
+  bool Fast = false;
+  bool Allowed =
+      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                             *FirstStore->getMemOperand(), &Fast);
+  if (!Allowed || !Fast)
+    return SDValue();
+
+  if (VT != CombinedValue.getValueType()) {
+    assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
+           "Unexpected store value to combine");
+    CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+                                CombinedValue);
+  }
+
+  if (NeedsBswap)
+    CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);
+
+  SDValue NewStore =
+    DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
+                 FirstStore->getPointerInfo(), FirstStore->getAlignment());
+
+  // Rely on other DAG combine rules to remove the other individual stores.
+  DAG.ReplaceAllUsesWith(N, NewStore.getNode());
+  return NewStore;
+}
+
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors. Fold it into a single load or a load
 /// and a BSWAP if the target supports it.
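--- Editorial aside (illustration only, not part of the patch) ---
The isBigEndian() helper added above is now shared by the new store combine
and the load combine in the next hunk: it classifies a run of byte offsets,
relative to the lowest one, as a little- or big-endian layout of a W-byte
value. A standalone C++ model with our own names and sample offsets:

#include <cassert>
#include <cstdint>

// Returns 0 for little endian, 1 for big endian, -1 if neither layout
// matches, mirroring the Optional<bool> result above.
static int classifyEndian(const int64_t *Offsets, unsigned W, int64_t First) {
  bool BE = true, LE = true;
  for (unsigned i = 0; i < W; ++i) {
    int64_t Off = Offsets[i] - First;
    LE &= Off == (int64_t)i;           // byte i stored at offset i
    BE &= Off == (int64_t)(W - 1 - i); // byte i stored mirrored
  }
  if (!BE && !LE)
    return -1;
  return BE ? 1 : 0;
}

int main() {
  int64_t LEOffs[4] = {8, 9, 10, 11}; // p[i] holds byte i: plain wide store
  int64_t BEOffs[4] = {11, 10, 9, 8}; // p[i] holds byte 3-i: BSWAP + store
  assert(classifyEndian(LEOffs, 4, 8) == 0);
  assert(classifyEndian(BEOffs, 4, 8) == 1);
  return 0;
}
--- End aside ---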
@@ -5916,11 +6688,6 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); - std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = []( - unsigned BW, unsigned i) { return i; }; - std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = []( - unsigned BW, unsigned i) { return BW - i - 1; }; - bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); @@ -5987,15 +6754,10 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Check if the bytes of the OR we are looking at match with either big or // little endian value load - bool BigEndian = true, LittleEndian = true; - for (unsigned i = 0; i < ByteWidth; i++) { - int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; - LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); - BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); - if (!BigEndian && !LittleEndian) - return SDValue(); - } - assert((BigEndian != LittleEndian) && "should be either or"); + Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); + if (!IsBigEndian.hasValue()) + return SDValue(); + assert(FirstByteProvider && "must be set"); // Ensure that the first byte is loaded from zero offset of the first load. @@ -6008,7 +6770,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // replace it with a single load and bswap if needed. // If the load needs byte swap check if the target supports it - bool NeedsBswap = IsBigEndianTarget != BigEndian; + bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single @@ -6019,8 +6781,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - VT, FirstLoad->getAddressSpace(), - FirstLoad->getAlignment(), &Fast); + VT, *FirstLoad->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); @@ -6160,7 +6921,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return NewSel; // reassociate xor - if (SDValue RXOR = ReassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) + if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) return RXOR; // fold !(x cc y) -> (x !cc y) @@ -6218,6 +6979,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return DAG.getNode(NewOpcode, DL, VT, LHS, RHS); } } + + // fold (not (neg x)) -> (add X, -1) + // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if + // Y is a constant or the subtract has a single use. + if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB && + isNullConstant(N0.getOperand(0))) { + return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), + DAG.getAllOnesConstant(DL, VT)); + } + // fold (xor (and x, y), y) -> (and (not x), y) if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) { SDValue X = N0.getOperand(0); @@ -6310,11 +7081,16 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { /// Handle transforms common to the three shifts, when the shift amount is a /// constant. 
+/// We are looking for: (shift being one of shl/sra/srl) +/// shift (binop X, C0), C1 +/// And want to transform into: +/// binop (shift X, C1), (shift C0, C1) SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { // Do not turn a 'not' into a regular xor. if (isBitwiseNot(N->getOperand(0))) return SDValue(); + // The inner binop must be one-use, since we want to replace it. SDNode *LHS = N->getOperand(0).getNode(); if (!LHS->hasOneUse()) return SDValue(); @@ -6322,56 +7098,43 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { // instead of (shift (and)), likewise for add, or, xor, etc. This sort of // thing happens with address calculations, so it's important to canonicalize // it. - bool HighBitSet = false; // Can we transform this if the high bit is set? - switch (LHS->getOpcode()) { - default: return SDValue(); + default: + return SDValue(); case ISD::OR: case ISD::XOR: - HighBitSet = false; // We can only transform sra if the high bit is clear. - break; case ISD::AND: - HighBitSet = true; // We can only transform sra if the high bit is set. break; case ISD::ADD: if (N->getOpcode() != ISD::SHL) return SDValue(); // only shl(add) not sr[al](add). - HighBitSet = false; // We can only transform sra if the high bit is clear. break; } // We require the RHS of the binop to be a constant and not opaque as well. ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1)); - if (!BinOpCst) return SDValue(); + if (!BinOpCst) + return SDValue(); // FIXME: disable this unless the input to the binop is a shift by a constant - // or is copy/select.Enable this in other cases when figure out it's exactly profitable. - SDNode *BinOpLHSVal = LHS->getOperand(0).getNode(); - bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL || - BinOpLHSVal->getOpcode() == ISD::SRA || - BinOpLHSVal->getOpcode() == ISD::SRL; - bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg || - BinOpLHSVal->getOpcode() == ISD::SELECT; - - if ((!isShift || !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1))) && - !isCopyOrSelect) + // or is copy/select. Enable this in other cases when figure out it's exactly + // profitable. + SDValue BinOpLHSVal = LHS->getOperand(0); + bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL || + BinOpLHSVal.getOpcode() == ISD::SRA || + BinOpLHSVal.getOpcode() == ISD::SRL) && + isa<ConstantSDNode>(BinOpLHSVal.getOperand(1)); + bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg || + BinOpLHSVal.getOpcode() == ISD::SELECT; + + if (!IsShiftByConstant && !IsCopyOrSelect) return SDValue(); - if (isCopyOrSelect && N->hasOneUse()) + if (IsCopyOrSelect && N->hasOneUse()) return SDValue(); EVT VT = N->getValueType(0); - // If this is a signed shift right, and the high bit is modified by the - // logical operation, do not perform the transformation. The highBitSet - // boolean indicates the value of the high bit of the constant which would - // cause it to be modified for this operation. 
- if (N->getOpcode() == ISD::SRA) { - bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative(); - if (BinOpRHSSignSet != HighBitSet) - return SDValue(); - } - if (!TLI.isDesirableToCommuteWithShift(N, Level)) return SDValue(); @@ -6395,11 +7158,12 @@ SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { assert(N->getOperand(0).getOpcode() == ISD::AND); // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) - if (N->hasOneUse() && N->getOperand(0).hasOneUse()) { + EVT TruncVT = N->getValueType(0); + if (N->hasOneUse() && N->getOperand(0).hasOneUse() && + TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) { SDValue N01 = N->getOperand(0).getOperand(1); if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { SDLoc DL(N); - EVT TruncVT = N->getValueType(0); SDValue N00 = N->getOperand(0).getOperand(0); SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); @@ -6431,6 +7195,7 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { } // fold (rot x, c) -> (rot x, c % BitSize) + // TODO - support non-uniform vector amounts. if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { if (Cst->getAPIntValue().uge(Bitsize)) { uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); @@ -6476,6 +7241,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return V; EVT VT = N0.getValueType(); + EVT ShiftVT = N1.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops @@ -6506,6 +7272,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (shl c1, c2) -> c1<<c2 + // TODO - support non-uniform vector shift amounts. ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); @@ -6517,6 +7284,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); + // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { @@ -6524,6 +7292,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); } + // TODO - support non-uniform vector shift amounts. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -6548,69 +7317,86 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); - EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum); } } - // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2))) + // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2)) // For this to be valid, the second form must not preserve any of the bits // that are shifted out by the inner shift in the first form. This means // the outer shift size must be >= the number of bits added by the ext. // As a corollary, we don't care what kind of ext it is. 
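--- Editorial aside (illustration only, not part of the patch) ---
For the shl-of-ext-of-shl fold above: with an i8 value zero-extended to i32,
any bit the inner i8 shift discards would be shifted past bit 31 by an outer
shift of 24 or more, so the two shift amounts can be merged outside the
extension. A brute-force check of the in-range case in plain C++ (the i8/i32
types and loop bounds are our choice):

#include <cassert>
#include <cstdint>

int main() {
  // shl (zext (shl x, c1)), c2 == shl (zext x), c1 + c2 whenever
  // c2 >= 32 - 8 (the bits added by the extension) and c1 + c2 < 32.
  for (uint32_t x = 0; x < 256; ++x)
    for (unsigned c2 = 24; c2 < 32; ++c2)
      for (unsigned c1 = 0; c1 + c2 < 32; ++c1)
        assert(((uint32_t)(uint8_t)(x << c1) << c2) ==
               ((uint32_t)x << (c1 + c2)));
  return 0;
}
--- End aside ---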
- if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND || - N0.getOpcode() == ISD::ANY_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND) && + if ((N0.getOpcode() == ISD::ZERO_EXTEND || + N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND) && N0.getOperand(0).getOpcode() == ISD::SHL) { SDValue N0Op0 = N0.getOperand(0); - if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { - APInt c1 = N0Op0C1->getAPIntValue(); - APInt c2 = N1C->getAPIntValue(); - zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + SDValue InnerShiftAmt = N0Op0.getOperand(1); + EVT InnerVT = N0Op0.getValueType(); + uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits(); - EVT InnerShiftVT = N0Op0.getValueType(); - uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); - if (c2.uge(OpSizeInBits - InnerShiftSize)) { - SDLoc DL(N0); - APInt Sum = c1 + c2; - if (Sum.uge(OpSizeInBits)) - return DAG.getConstant(0, DL, VT); + auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return c2.uge(OpSizeInBits - InnerBitwidth) && + (c1 + c2).uge(OpSizeInBits); + }; + if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) + return DAG.getConstant(0, SDLoc(N), VT); - return DAG.getNode( - ISD::SHL, DL, VT, - DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)), - DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); - } + auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); + return c2.uge(OpSizeInBits - InnerBitwidth) && + (c1 + c2).ult(OpSizeInBits); + }; + if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0)); + SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT); + Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1); + return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum); } } // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C)) // Only fold this if the inner zext has no other uses to avoid increasing // the total number of instructions. 
- if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && + if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::SRL) { SDValue N0Op0 = N0.getOperand(0); - if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { - if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) { - uint64_t c1 = N0Op0C1->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - if (c1 == c2) { - SDValue NewOp0 = N0.getOperand(0); - EVT CountVT = NewOp0.getOperand(1).getValueType(); - SDLoc DL(N); - SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(), - NewOp0, - DAG.getConstant(c2, DL, CountVT)); - AddToWorklist(NewSHL.getNode()); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); - } - } + SDValue InnerShiftAmt = N0Op0.getOperand(1); + + auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) { + APInt c1 = LHS->getAPIntValue(); + APInt c2 = RHS->getAPIntValue(); + zeroExtendToMatch(c1, c2); + return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2); + }; + if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType(); + SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT); + NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL); + AddToWorklist(NewSHL.getNode()); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); } } // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 + // TODO - support non-uniform vector shift amounts. if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { @@ -6619,9 +7405,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { SDLoc DL(N); if (C1 <= C2) return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(C2 - C1, DL, N1.getValueType())); + DAG.getConstant(C2 - C1, DL, ShiftVT)); return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), - DAG.getConstant(C1 - C2, DL, N1.getValueType())); + DAG.getConstant(C1 - C2, DL, ShiftVT)); } } @@ -6629,11 +7415,13 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // (and (srl x, (sub c1, c2), MASK) // Only fold this if the inner shift has no other uses -- if it does, folding // this will increase the total number of instructions. + // TODO - drop hasOneUse requirement if c1 == c2? + // TODO - support non-uniform vector shift amounts. 
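--- Editorial aside (illustration only, not part of the patch) ---
The shl-of-srl fold guarded just below replaces a shift pair with one shift
plus an AND mask. A numeric spot-check of both directions in plain C++ (the
concrete amounts 3/5 and the 32-bit type are our choice):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t i = 0; i < (1u << 16); ++i) {
    uint32_t x = i * 2654435761u; // scatter test values across 32 bits
    // c1 = 3 < c2 = 5: one shl by 2, mask clears the low 5 bits.
    assert(((x >> 3) << 5) == ((x << 2) & ~31u));
    // c1 = 5 > c2 = 3: one srl by 2, mask keeps bits [3, 29].
    assert(((x >> 5) << 3) == ((x >> 2) & 0x3FFFFFF8u));
  }
  return 0;
}
--- End aside ---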
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() && - TLI.shouldFoldShiftPairToMask(N, Level)) { + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { - uint64_t c1 = N0C1->getZExtValue(); - if (c1 < OpSizeInBits) { + if (N0C1->getAPIntValue().ult(OpSizeInBits)) { + uint64_t c1 = N0C1->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; @@ -6641,12 +7429,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { Mask <<= c2 - c1; SDLoc DL(N); Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), - DAG.getConstant(c2 - c1, DL, N1.getValueType())); + DAG.getConstant(c2 - c1, DL, ShiftVT)); } else { Mask.lshrInPlace(c1 - c2); SDLoc DL(N); Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), - DAG.getConstant(c1 - c2, DL, N1.getValueType())); + DAG.getConstant(c1 - c2, DL, ShiftVT)); } SDLoc DL(N0); return DAG.getNode(ISD::AND, DL, VT, Shift, @@ -6719,6 +7507,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (sra c1, c2) -> (sra c1, c2) + // TODO - support non-uniform vector shift amounts. ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); @@ -6815,32 +7604,32 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); } + // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2)) // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes + // TODO - support non-uniform vector shift amounts. if (N0.getOpcode() == ISD::TRUNCATE && (N0.getOperand(0).getOpcode() == ISD::SRL || N0.getOperand(0).getOpcode() == ISD::SRA) && N0.getOperand(0).hasOneUse() && - N0.getOperand(0).getOperand(1).hasOneUse() && - N1C) { + N0.getOperand(0).getOperand(1).hasOneUse() && N1C) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) { - unsigned LargeShiftVal = LargeShift->getZExtValue(); EVT LargeVT = N0Op0.getValueType(); - - if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) { + unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits; + if (LargeShift->getAPIntValue() == TruncBits) { SDLoc DL(N); - SDValue Amt = - DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL, - getShiftAmountTy(N0Op0.getOperand(0).getValueType())); - SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT, - N0Op0.getOperand(0), Amt); + SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL, + getShiftAmountTy(LargeVT)); + SDValue SRA = + DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt); return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA); } } } // Simplify, based on bits shifted out of the LHS. + // TODO - support non-uniform vector shift amounts. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -6872,6 +7661,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (srl c1, c2) -> c1 >>u c2 + // TODO - support non-uniform vector shift amounts. 
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); @@ -6912,6 +7702,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) + // TODO - support non-uniform vector shift amounts. if (N1C && N0.getOpcode() == ISD::TRUNCATE && N0.getOperand(0).getOpcode() == ISD::SRL) { if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { @@ -6935,6 +7726,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } // fold (srl (shl x, c), c) -> (and x, cst2) + // TODO - (srl (shl x, c1), c2). if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && isConstantOrConstantVector(N1, /* NoOpaques */ true)) { SDLoc DL(N); @@ -6945,11 +7737,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) + // TODO - support non-uniform vector shift amounts. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { // Shifting in all undef bits? EVT SmallVT = N0.getOperand(0).getValueType(); unsigned BitSize = SmallVT.getScalarSizeInBits(); - if (N1C->getZExtValue() >= BitSize) + if (N1C->getAPIntValue().uge(BitSize)) return DAG.getUNDEF(VT); if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { @@ -6970,7 +7763,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign // bit, which is unmodified by sra. - if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) { + if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) { if (N0.getOpcode() == ISD::SRA) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); } @@ -7021,6 +7814,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold operands of srl based on knowledge that the low bits are not // demanded. + // TODO - support non-uniform vector shift amounts. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -7079,13 +7873,49 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1))) return IsFSHL ? N0 : N1; - // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) + auto IsUndefOrZero = [](SDValue V) { + return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true); + }; + + // TODO - support non-uniform vector shift amounts. if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) { + EVT ShAmtTy = N2.getValueType(); + + // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) if (Cst->getAPIntValue().uge(BitWidth)) { uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth); return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1, - DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType())); + DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy)); } + + unsigned ShAmt = Cst->getZExtValue(); + if (ShAmt == 0) + return IsFSHL ? N0 : N1; + + // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C) + // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C) + // fold fshl(N0, undef_or_zero, C) -> shl(N0, C) + // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C) + if (IsUndefOrZero(N0)) + return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, + DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, + SDLoc(N), ShAmtTy)); + if (IsUndefOrZero(N1)) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, + DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, + SDLoc(N), ShAmtTy)); + } + + // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) + // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2) + // iff We know the shift amount is in range. 
+ // TODO: when is it worth doing SUB(BW, N2) as well? + if (isPowerOf2_32(BitWidth)) { + APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1); + if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) + return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2); + if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2); } // fold (fshl N0, N0, N2) -> (rotl N0, N2) @@ -7096,6 +7926,10 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { if (N0 == N1 && hasOperation(RotOpc, VT)) return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2); + // Simplify, based on bits shifted out of N0/N1. + if (SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -7207,11 +8041,14 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { // FIXME: This should be checking for no signed zeros on individual operands, as // well as no nans. -static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, SDValue RHS) { +static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS, + SDValue RHS, + const TargetLowering &TLI) { const TargetOptions &Options = DAG.getTarget().Options; EVT VT = LHS.getValueType(); return Options.NoSignedZerosFPMath && VT.isFloatingPoint() && + TLI.isProfitableToCombineMinNumMaxNum(VT) && DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS); } @@ -7364,6 +8201,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { EVT VT = N->getValueType(0); EVT VT0 = N0.getValueType(); SDLoc DL(N); + SDNodeFlags Flags = N->getFlags(); if (SDValue V = DAG.simplifySelect(N0, N1, N2)) return V; @@ -7414,20 +8252,26 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); SDValue InnerSelect = - DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); + DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, - InnerSelect, N2); + InnerSelect, N2, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); } // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = - DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); + SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1, - InnerSelect); + InnerSelect, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); } // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y @@ -7439,12 +8283,14 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // Create the actual and node if we can generate good code for it. if (!normalizeToSequence) { SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); - return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, + N2, Flags); } // Otherwise see if we can optimize the "and" to a better pattern. 
- if (SDValue Combined = visitANDLike(N0, N1_0, N)) + if (SDValue Combined = visitANDLike(N0, N1_0, N)) { return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1, - N2); + N2, Flags); + } } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y @@ -7456,20 +8302,22 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // Create the actual or node if we can generate good code for it. if (!normalizeToSequence) { SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); - return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2); + return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, + N2_2, Flags); } // Otherwise see if we can optimize to a better pattern. if (SDValue Combined = visitORLike(N0, N2_0, N)) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, - N2_2); + N2_2, Flags); } } } - if (VT0 == MVT::i1) { - // select (not Cond), N1, N2 -> select Cond, N2, N1 - if (isBitwiseNot(N0)) - return DAG.getNode(ISD::SELECT, DL, VT, N0->getOperand(0), N2, N1); + // select (not Cond), N1, N2 -> select Cond, N2, N1 + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { + SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1); + SelectOp->setFlags(Flags); + return SelectOp; } // Fold selects based on a setcc into other things, such as min/max/abs. @@ -7481,7 +8329,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // select (fcmp gt x, y), x, y -> fmaxnum x, y // // This is OK if we don't care what happens if either operand is a NaN. - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2)) + if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI)) if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, CC, TLI, DAG)) return FMinMax; @@ -7516,9 +8364,16 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } if (TLI.isOperationLegal(ISD::SELECT_CC, VT) || - (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) - return DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2, - N0.getOperand(2)); + (!LegalOperations && + TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { + // Any flags available in a select/setcc fold will be on the setcc as they + // migrated from fcmp + Flags = N0.getNode()->getFlags(); + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, + N2, N0.getOperand(2)); + SelectNode->setFlags(Flags); + return SelectNode; + } return SimplifySelect(DL, N0, N1, N2); } @@ -7599,14 +8454,19 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { } SDValue DAGCombiner::visitMSCATTER(SDNode *N) { - if (Level >= AfterLegalizeTypes) - return SDValue(); - MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N); SDValue Mask = MSC->getMask(); - SDValue Data = MSC->getValue(); + SDValue Data = MSC->getValue(); + SDValue Chain = MSC->getChain(); SDLoc DL(N); + // Zap scatters with a zero mask. + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return Chain; + + if (Level >= AfterLegalizeTypes) + return SDValue(); + // If the MSCATTER data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. 
This // prevents the type legalizer from unrolling SETCC into scalar comparisons @@ -7624,8 +8484,6 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0)); - SDValue Chain = MSC->getChain(); - EVT MemoryVT = MSC->getMemoryVT(); unsigned Alignment = MSC->getOriginalAlignment(); @@ -7658,15 +8516,20 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { } SDValue DAGCombiner::visitMSTORE(SDNode *N) { - if (Level >= AfterLegalizeTypes) - return SDValue(); - - MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N); + MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); SDValue Mask = MST->getMask(); - SDValue Data = MST->getValue(); + SDValue Data = MST->getValue(); + SDValue Chain = MST->getChain(); EVT VT = Data.getValueType(); SDLoc DL(N); + // Zap masked stores with a zero mask. + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return Chain; + + if (Level >= AfterLegalizeTypes) + return SDValue(); + // If the MSTORE data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons @@ -7680,17 +8543,11 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); - SDValue Chain = MST->getChain(); SDValue Ptr = MST->getBasePtr(); EVT MemoryVT = MST->getMemoryVT(); unsigned Alignment = MST->getOriginalAlignment(); - // if Alignment is equal to the vector size, - // take the half of it for the second part - unsigned SecondHalfAlignment = - (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment; - EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); @@ -7712,7 +8569,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { MMO = DAG.getMachineFunction().getMachineMemOperand( MST->getPointerInfo().getWithOffset(HiOffset), - MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, + MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges()); Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, @@ -7728,13 +8585,17 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { } SDValue DAGCombiner::visitMGATHER(SDNode *N) { - if (Level >= AfterLegalizeTypes) - return SDValue(); - MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N); SDValue Mask = MGT->getMask(); SDLoc DL(N); + // Zap gathers with a zero mask. + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return CombineTo(N, MGT->getPassThru(), MGT->getChain()); + + if (Level >= AfterLegalizeTypes) + return SDValue(); + // If the MGATHER result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons @@ -7805,13 +8666,17 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { } SDValue DAGCombiner::visitMLOAD(SDNode *N) { - if (Level >= AfterLegalizeTypes) - return SDValue(); - - MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N); + MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N); SDValue Mask = MLD->getMask(); SDLoc DL(N); + // Zap masked loads with a zero mask. 
+ if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return CombineTo(N, MLD->getPassThru(), MLD->getChain()); + + if (Level >= AfterLegalizeTypes) + return SDValue(); + // If the MLOAD result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons @@ -7839,12 +8704,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { EVT MemoryVT = MLD->getMemoryVT(); unsigned Alignment = MLD->getOriginalAlignment(); - // if Alignment is equal to the vector size, - // take the half of it for the second part - unsigned SecondHalfAlignment = - (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? - Alignment/2 : Alignment; - EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); @@ -7862,7 +8721,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { MMO = DAG.getMachineFunction().getMachineMemOperand( MLD->getPointerInfo().getWithOffset(HiOffset), - MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, + MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, PassThruHi, HiMemVT, @@ -7943,11 +8802,16 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); SDLoc DL(N); if (SDValue V = DAG.simplifySelect(N0, N1, N2)) return V; + // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1 + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) + return DAG.getSelect(DL, VT, F, N2, N1); + // Canonicalize integer abs. // vselect (setg[te] X, 0), X, -X -> // vselect (setgt X, -1), X, -X -> @@ -7987,11 +8851,10 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { // This is OK if we don't care about what happens if either operand is a // NaN. 
// - EVT VT = N->getValueType(0); - if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N0.getOperand(0), N0.getOperand(1))) { - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N0.getOperand(0), + N0.getOperand(1), TLI)) { if (SDValue FMinMax = combineMinNumMaxNum( - DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) + DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) return FMinMax; } @@ -8080,9 +8943,11 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { return N2; } else if (SCC.getOpcode() == ISD::SETCC) { // Fold to a simpler select_cc - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), - SCC.getOperand(0), SCC.getOperand(1), N2, N3, - SCC.getOperand(2)); + SDValue SelectOp = DAG.getNode( + ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), + SCC.getOperand(1), N2, N3, SCC.getOperand(2)); + SelectOp->setFlags(SCC->getFlags()); + return SelectOp; } } @@ -8148,6 +9013,7 @@ static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || @@ -8158,7 +9024,33 @@ static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, // fold (zext c1) -> c1 // fold (aext c1) -> c1 if (isa<ConstantSDNode>(N0)) - return DAG.getNode(Opcode, SDLoc(N), VT, N0); + return DAG.getNode(Opcode, DL, VT, N0); + + // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) + // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2) + // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2) + if (N0->getOpcode() == ISD::SELECT) { + SDValue Op1 = N0->getOperand(1); + SDValue Op2 = N0->getOperand(2); + if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) && + (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) { + // For any_extend, choose sign extension of the constants to allow a + // possible further transform to sign_extend_inreg.i.e. + // + // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0> + // t2: i64 = any_extend t1 + // --> + // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0> + // --> + // t4: i64 = sign_extend_inreg t3 + unsigned FoldOpc = Opcode; + if (FoldOpc == ISD::ANY_EXTEND) + FoldOpc = ISD::SIGN_EXTEND; + return DAG.getSelect(DL, VT, N0->getOperand(0), + DAG.getNode(FoldOpc, DL, VT, Op1), + DAG.getNode(FoldOpc, DL, VT, Op2)); + } + } // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) @@ -8173,7 +9065,6 @@ static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits(); SmallVector<SDValue, 8> Elts; unsigned NumElts = VT.getVectorNumElements(); - SDLoc DL(N); // For zero-extensions, UNDEF elements still guarantee to have the upper // bits set to zero. 
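--- Editorial aside (illustration only, not part of the patch) ---
The select-of-constants fold above extends both arms at compile time, and
for any_extend it deliberately picks sign extension so a later
sign_extend_inreg can fold away. Both shapes of the i8-to-i64 example from
the comment, in plain C++ (helper names are ours):

#include <cassert>
#include <cstdint>

static int64_t extendOfSelect(bool Cond) {
  int8_t Narrow = Cond ? (int8_t)-1 : (int8_t)0; // select of i8 constants
  return (int64_t)Narrow;                        // the extend node
}

static int64_t selectOfExtended(bool Cond) {
  return Cond ? (int64_t)-1 : (int64_t)0; // constants extended up front
}

int main() {
  assert(extendOfSelect(true) == selectOfExtended(true));
  assert(extendOfSelect(false) == selectOfExtended(false));
  return 0;
}
--- End aside ---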
@@ -8387,6 +9278,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { assert(N->getOpcode() == ISD::ZERO_EXTEND); EVT VT = N->getValueType(0); + EVT OrigVT = N->getOperand(0).getValueType(); + if (TLI.isZExtFree(OrigVT, VT)) + return SDValue(); // and/or/xor SDValue N0 = N->getOperand(0); @@ -8450,6 +9344,10 @@ SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { Load->getValueType(0), ExtLoad); CombineTo(Load, Trunc, ExtLoad.getValue(1)); } + + // N0 is dead at this point. + recursivelyDeleteUnusedNodes(N0.getNode()); + return SDValue(N,0); // Return N so it doesn't get rechecked! } @@ -8509,19 +9407,21 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, : ISD::isZEXTLoad(N0Node); if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) || !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse()) - return {}; + return SDValue(); LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT MemVT = LN0->getMemoryVT(); if ((LegalOperations || LN0->isVolatile() || VT.isVector()) && !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) - return {}; + return SDValue(); SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); Combiner.CombineTo(N, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + if (LN0->use_empty()) + Combiner.recursivelyDeleteUnusedNodes(LN0); return SDValue(N, 0); // Return N so it doesn't get rechecked! } @@ -8559,6 +9459,7 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, Combiner.CombineTo(N, ExtLoad); if (NoReplaceTrunc) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + Combiner.recursivelyDeleteUnusedNodes(LN0); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); @@ -8804,6 +9705,25 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; + // Eliminate this sign extend by doing a negation in the destination type: + // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64) + if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && + isNullOrNullSplat(N0.getOperand(0)) && + N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND && + TLI.isOperationLegalOrCustom(ISD::SUB, VT)) { + SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext); + } + // Eliminate this sign extend by doing a decrement in the destination type: + // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1) + if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && + isAllOnesOrAllOnesSplat(N0.getOperand(1)) && + N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && + TLI.isOperationLegalOrCustom(ISD::ADD, VT)) { + SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); + return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); + } + return SDValue(); } @@ -9061,14 +9981,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) { SDValue ShAmt = N0.getOperand(1); - unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue(); if (N0.getOpcode() == ISD::SHL) { SDValue InnerZExt = N0.getOperand(0); // If the original shl may be shifting out bits, do not perform this // transformation. 
@@ -9061,14 +9981,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
       N0.hasOneUse()) {
     SDValue ShAmt = N0.getOperand(1);
-    unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
     if (N0.getOpcode() == ISD::SHL) {
       SDValue InnerZExt = N0.getOperand(0);
       // If the original shl may be shifting out bits, do not perform this
       // transformation.
       unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
                                InnerZExt.getOperand(0).getValueSizeInBits();
-      if (ShAmtVal > KnownZeroBits)
+      if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
         return SDValue();
     }
@@ -9162,6 +10081,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
     CombineTo(N, ExtLoad);
     if (NoReplaceTrunc) {
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+      recursivelyDeleteUnusedNodes(LN0);
     } else {
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
@@ -9185,6 +10105,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
                                        MemVT, LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+      recursivelyDeleteUnusedNodes(LN0);
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
   }
@@ -9574,14 +10495,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
   // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
   if (N0.getOpcode() == ISD::SRL) {
-    if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
-      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
+    if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+      if (ShAmt->getAPIntValue().ule(VTBits - EVTBits)) {
         // We can turn this into an SRA iff the input to the SRL is already sign
         // extended enough.
         unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
-        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
-          return DAG.getNode(ISD::SRA, SDLoc(N), VT,
-                             N0.getOperand(0), N0.getOperand(1));
+        if (((VTBits - EVTBits) - ShAmt->getZExtValue()) < InSignBits)
+          return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
+                             N0.getOperand(1));
       }
   }
@@ -9667,10 +10588,11 @@ SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
   bool isLE = DAG.getDataLayout().isLittleEndian();
 
   // noop truncate
-  if (N0.getValueType() == N->getValueType(0))
+  if (SrcVT == VT)
     return N0;
 
   // fold (truncate (truncate x)) -> (truncate x)
@@ -9740,7 +10662,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
   if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
-    EVT SrcVT = N0.getValueType();
     if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
         TLI.isTruncateFree(SrcVT, VT)) {
       SDLoc SL(N0);
@@ -9753,7 +10674,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
   if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
       TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
     SDValue Amt = N0.getOperand(1);
     KnownBits Known = DAG.computeKnownBits(Amt);
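The sext_in_reg(srl) -> sra guard above is arithmetic about sign bits; a hedged scalar sketch (assumes two's-complement narrowing conversions and an arithmetic right shift of negative values, both guaranteed since C++20 and universal in practice) checks the claimed equivalence:

    #include <cassert>
    #include <cstdint>

    // sext_in_reg (srl X, 23), i8 modeled on i32: shift right logically, then
    // sign-extend the low 8 bits back to i32.
    static int32_t sextInRegOfSrl(int32_t X) {
      uint32_t Srl = static_cast<uint32_t>(X) >> 23;
      return static_cast<int8_t>(Srl);
    }

    // The replacement form: sra X, 23.
    static int32_t sraForm(int32_t X) { return X >> 23; }

    int main() {
      // Inputs sign-extended from i16, so InSignBits >= 17 and the guard
      // VTBits - (ShAmt + EVTBits) = 32 - (23 + 8) = 1 < InSignBits holds.
      for (int32_t V = -32768; V <= 32767; ++V)
        assert(sextInRegOfSrl(V) == sraForm(V));
    }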
@@ -9771,6 +10692,19 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     }
   }
 
+  // Attempt to pre-truncate BUILD_VECTOR sources.
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
+      TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType())) {
+    SDLoc DL(N);
+    EVT SVT = VT.getScalarType();
+    SmallVector<SDValue, 8> TruncOps;
+    for (const SDValue &Op : N0->op_values()) {
+      SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
+      TruncOps.push_back(TruncOp);
+    }
+    return DAG.getBuildVector(VT, DL, TruncOps);
+  }
+
   // Fold a series of buildvector, bitcast, and truncate if possible.
   // For example fold
   // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
@@ -9906,7 +10840,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // When the adde's carry is not used.
   if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
       N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
-      (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) {
+      // We only do this for ADDCARRY before operation legalization.
+      ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
+       TLI.isOperationLegal(N0.getOpcode(), VT))) {
     SDLoc SL(N);
     auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
     auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -10070,14 +11006,17 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
     return DAG.getUNDEF(VT);
 
   // If the input is a BUILD_VECTOR with all constant elements, fold this now.
-  // Only do this before legalize types, since we might create an illegal
-  // scalar type. Even if we knew we wouldn't create an illegal scalar type
-  // we can only do this before legalize ops, since the target maybe
-  // depending on the bitcast.
+  // Only do this before legalize types, unless both types are integer and the
+  // scalar type is legal. Only do this before legalize ops, since the target
+  // may be depending on the bitcast.
   // First check to see if this is all constant.
-  if (!LegalTypes &&
+  // TODO: Support FP bitcasts after legalize types.
+  if (VT.isVector() &&
+      (!LegalTypes ||
+       (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
+        TLI.isTypeLegal(VT.getVectorElementType()))) &&
       N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
-      VT.isVector() && cast<BuildVectorSDNode>(N0)->isConstant())
+      cast<BuildVectorSDNode>(N0)->isConstant())
     return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
                                              VT.getVectorElementType());
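The pre-truncation of BUILD_VECTOR sources holds because truncation is applied per element. A minimal scalar model (assumption: two's-complement truncation of out-of-range conversions, guaranteed since C++20):

    #include <cassert>
    #include <cstdint>

    // trunc (build_vector x, y) --> build_vector (trunc x), (trunc y)
    int main() {
      int64_t X = 0x1122334455667788LL, Y = -1;
      int32_t Lane0 = static_cast<int32_t>(X); // trunc x keeps the low 32 bits
      int32_t Lane1 = static_cast<int32_t>(Y); // trunc y
      assert(Lane0 == 0x55667788 && Lane1 == -1);
    }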
@@ -10113,18 +11052,14 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT)) &&
-      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+       TLI.isOperationLegal(ISD::LOAD, VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    unsigned OrigAlign = LN0->getAlignment();
-    bool Fast = false;
-    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                               LN0->getAddressSpace(), OrigAlign, &Fast) &&
-        Fast) {
+    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                    *LN0->getMemOperand())) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                      LN0->getPointerInfo(), OrigAlign,
+                      LN0->getPointerInfo(), LN0->getAlignment(),
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
       return Load;
@@ -11071,15 +12006,17 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
 
   // fold (fadd A, (fneg B)) -> (fsub A, B)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
-      isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
+      isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize) == 2)
     return DAG.getNode(ISD::FSUB, DL, VT, N0,
-                       GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+                       GetNegatedExpression(N1, DAG, LegalOperations,
+                                            ForCodeSize), Flags);
 
   // fold (fadd (fneg A), B) -> (fsub B, A)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
-      isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
+      isNegatibleForFree(N0, LegalOperations, TLI, &Options, ForCodeSize) == 2)
     return DAG.getNode(ISD::FSUB, DL, VT, N1,
-                       GetNegatedExpression(N0, DAG, LegalOperations), Flags);
+                       GetNegatedExpression(N0, DAG, LegalOperations,
+                                            ForCodeSize), Flags);
 
   auto isFMulNegTwo = [](SDValue FMul) {
     if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
       return false;
@@ -11105,8 +12042,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   // Selection pass has a hard time dealing with FP constants.
   bool AllowNewConst = (Level < AfterLegalizeDAG);
 
-  // If 'unsafe math' or nnan is enabled, fold lots of things.
-  if ((Options.UnsafeFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
+  // If nnan is enabled, fold lots of things.
+  if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
     // If allowed, fold (fadd (fneg x), x) -> 0.0
     if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
       return DAG.getConstantFP(0.0, DL, VT);
@@ -11246,16 +12183,20 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 
   if (N0 == N1) {
     // (fsub x, x) -> 0.0
-    if (Options.UnsafeFPMath || Flags.hasNoNaNs())
+    if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
       return DAG.getConstantFP(0.0f, DL, VT);
   }
 
   // (fsub -0.0, N1) -> -N1
+  // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
+  // FSUB does not specify the sign bit of a NaN. Also note that for
+  // the same reason, the inverse transform is not safe, unless fast math
+  // flags are in play.
   if (N0CFP && N0CFP->isZero()) {
     if (N0CFP->isNegative() ||
         (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
-      if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
-        return GetNegatedExpression(N1, DAG, LegalOperations);
+      if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize))
+        return GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
       if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
         return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
     }
   }
@@ -11273,9 +12214,10 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   }
 
   // fold (fsub A, (fneg B)) -> (fadd A, B)
-  if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
+  if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize))
     return DAG.getNode(ISD::FADD, DL, VT, N0,
-                       GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+                       GetNegatedExpression(N1, DAG, LegalOperations,
+                                            ForCodeSize), Flags);
 
   // FSUB -> FMA combines:
   if (SDValue Fused = visitFSUBForFMACombine(N)) {
@@ -11319,7 +12261,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
-  if (Options.UnsafeFPMath ||
+  if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
     // fold (fmul A, 0) -> 0
     if (N1CFP && N1CFP->isZero())
@@ -11361,14 +12303,18 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
       return DAG.getNode(ISD::FNEG, DL, VT, N0);
 
   // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
-  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
-    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options,
+                                       ForCodeSize)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options,
+                                         ForCodeSize)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
         return DAG.getNode(ISD::FMUL, DL, VT,
-                           GetNegatedExpression(N0, DAG, LegalOperations),
-                           GetNegatedExpression(N1, DAG, LegalOperations),
+                           GetNegatedExpression(N0, DAG, LegalOperations,
+                                                ForCodeSize),
+                           GetNegatedExpression(N1, DAG, LegalOperations,
+                                                ForCodeSize),
                            Flags);
     }
   }
@@ -11506,7 +12452,8 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
   // fma (fneg x), K, y -> fma x -K, y
   if (N0.getOpcode() == ISD::FNEG &&
       (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
-       (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
+       (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
+                                            ForCodeSize)))) {
     return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                        DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
   }
@@ -11541,22 +12488,33 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
 // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
 // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
+  // TODO: Limit this transform based on optsize/minsize - it always creates at
+  //       least 1 extra instruction. But the perf win may be substantial enough
+  //       that only minsize should restrict this.
   bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
   const SDNodeFlags Flags = N->getFlags();
   if (!UnsafeMath && !Flags.hasAllowReciprocal())
     return SDValue();
 
-  // Skip if current node is a reciprocal.
+  // Skip if current node is a reciprocal/fneg-reciprocal.
   SDValue N0 = N->getOperand(0);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-  if (N0CFP && N0CFP->isExactlyValue(1.0))
+  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
+  if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
     return SDValue();
 
   // Exit early if the target does not want this transform or if there can't
   // possibly be enough uses of the divisor to make the transform worthwhile.
   SDValue N1 = N->getOperand(1);
   unsigned MinUses = TLI.combineRepeatedFPDivisors();
-  if (!MinUses || N1->use_size() < MinUses)
+
+  // For splat vectors, scale the number of uses by the splat factor. If we can
+  // convert the division into a scalar op, that will likely be much faster.
+  unsigned NumElts = 1;
+  EVT VT = N->getValueType(0);
+  if (VT.isVector() && DAG.isSplatValue(N1))
+    NumElts = VT.getVectorNumElements();
+
+  if (!MinUses || (N1->use_size() * NumElts) < MinUses)
     return SDValue();
 
   // Find all FDIV users of the same divisor.
@@ -11573,10 +12531,9 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
 
   // Now that we have the actual number of divisor uses, make sure it meets
   // the minimum threshold specified by the target.
-  if (Users.size() < MinUses)
+  if ((Users.size() * NumElts) < MinUses)
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
   SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
@@ -11619,6 +12576,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
+  if (SDValue V = combineRepeatedFPDivisors(N))
+    return V;
+
   if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
     if (N1CFP) {
@@ -11634,7 +12594,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
            // backend)... we should handle this gracefully after Legalize.
            // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
            TLI.isOperationLegal(ISD::ConstantFP, VT) ||
-           TLI.isFPImmLegal(Recip, VT)))
+           TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
         return DAG.getNode(ISD::FMUL, DL, VT, N0,
                            DAG.getConstantFP(Recip, DL, VT), Flags);
     }
@@ -11692,21 +12652,22 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   }
 
   // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
-  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
-    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options,
+                                       ForCodeSize)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options,
+                                         ForCodeSize)) {
       // Both can be negated for free, check to see if at least one is cheaper
      // negated.
      if (LHSNeg == 2 || RHSNeg == 2)
        return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
-                          GetNegatedExpression(N0, DAG, LegalOperations),
-                          GetNegatedExpression(N1, DAG, LegalOperations),
+                          GetNegatedExpression(N0, DAG, LegalOperations,
+                                               ForCodeSize),
+                          GetNegatedExpression(N1, DAG, LegalOperations,
+                                               ForCodeSize),
                           Flags);
     }
   }
 
-  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
-    return CombineRepeatedDivisors;
-
   return SDValue();
 }
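combineRepeatedFPDivisors hoists a single reciprocal out of several divisions by the same value. A sketch of the shape of the rewrite (illustrative; requires reciprocal fast-math to be legal, since it changes rounding):

    #include <cassert>
    #include <cmath>

    // N divisions by the same D become one fdiv plus N fmuls.
    static void scaleAll(float *V, int N, float D) {
      float Recip = 1.0f / D;   // the single fdiv
      for (int i = 0; i < N; ++i)
        V[i] *= Recip;          // one fmul per former fdiv
    }

    int main() {
      float V[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      scaleAll(V, 4, 2.0f);     // 2.0 has an exact reciprocal, so no rounding
      assert(std::fabs(V[2] - 1.5f) < 1e-6f);
    }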
@@ -11838,18 +12799,24 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
     return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags);
   }
 
-  // Try to convert x ** (1/4) into square roots.
+  // Try to convert x ** (1/4) and x ** (3/4) into square roots.
   // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
   // TODO: This could be extended (using a target hook) to handle smaller
   // power-of-2 fractional exponents.
-  if (ExponentC->getValueAPF().isExactlyValue(0.25)) {
+  bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
+  bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
+  if (ExponentIs025 || ExponentIs075) {
     // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
     // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) =  NaN.
+    // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
+    // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) =  NaN.
     // For regular numbers, rounding may cause the results to differ.
     // Therefore, we require { nsz ninf afn } for this transform.
     // TODO: We could select out the special cases if we don't have nsz/ninf.
     SDNodeFlags Flags = N->getFlags();
-    if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() ||
+
+    // We only need no signed zeros for the 0.25 case.
+    if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
         !Flags.hasApproximateFuncs())
       return SDValue();
 
@@ -11859,13 +12826,17 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
 
     // Assume that libcalls are the smallest code.
     // TODO: This restriction should probably be lifted for vectors.
-    if (DAG.getMachineFunction().getFunction().optForSize())
+    if (DAG.getMachineFunction().getFunction().hasOptSize())
      return SDValue();
 
     // pow(X, 0.25) --> sqrt(sqrt(X))
     SDLoc DL(N);
     SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags);
-    return DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
+    SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
+    if (ExponentIs025)
+      return SqrtSqrt;
+    // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
+    return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags);
   }
 
   return SDValue();
 }
@@ -11911,6 +12882,10 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  // [us]itofp(undef) = 0, because the result value is bounded.
+  if (N0.isUndef())
+    return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
   // fold (sint_to_fp c1) -> c1fp
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       // ...but only if the target supports immediate floating-point values
@@ -11968,6 +12943,10 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  // [us]itofp(undef) = 0, because the result value is bounded.
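The algebra behind the new 0.75 case: X^(3/4) = X^(1/2) * X^(1/4), so one extra multiply reuses both square roots. A quick numeric check (standalone sketch, not patch code; exact equality is not expected, hence the tolerance and the nsz/ninf/afn flags required above):

    #include <cassert>
    #include <cmath>

    // pow(X, 0.25) --> sqrt(sqrt(X)); pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
    int main() {
      for (double X : {0.5, 1.0, 2.0, 42.0}) {
        double S = std::sqrt(X), SS = std::sqrt(S);
        assert(std::fabs(SS - std::pow(X, 0.25)) < 1e-12);
        assert(std::fabs(S * SS - std::pow(X, 0.75)) < 1e-12);
      }
    }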
+  if (N0.isUndef())
+    return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
   // fold (uint_to_fp c1) -> c1fp
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       // ...but only if the target supports immediate floating-point values
@@ -12051,6 +13030,10 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
+  // fold (fp_to_sint undef) -> undef
+  if (N0.isUndef())
+    return DAG.getUNDEF(VT);
+
   // fold (fp_to_sint c1fp) -> c1
   if (isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
@@ -12062,6 +13045,10 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
+  // fold (fp_to_uint undef) -> undef
+  if (N0.isUndef())
+    return DAG.getUNDEF(VT);
+
   // fold (fp_to_uint c1fp) -> c1
   if (isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
@@ -12250,8 +13237,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
     return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
 
   if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
-                         &DAG.getTarget().Options))
-    return GetNegatedExpression(N0, DAG, LegalOperations);
+                         &DAG.getTarget().Options, ForCodeSize))
+    return GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
 
   // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
   // constant pool values.
@@ -12287,7 +13274,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
       APFloat CVal = CFP1->getValueAPF();
       CVal.changeSign();
       if (Level >= AfterLegalizeDAG &&
-          (TLI.isFPImmLegal(CVal, VT) ||
+          (TLI.isFPImmLegal(CVal, VT, ForCodeSize) ||
           TLI.isOperationLegal(ISD::ConstantFP, VT)))
        return DAG.getNode(
            ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
@@ -12556,6 +13543,7 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
 
   TargetLowering::AddrMode AM;
   if (N->getOpcode() == ISD::ADD) {
+    AM.HasBaseReg = true;
     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
     if (Offset)
       // [reg +/- imm]
@@ -12564,6 +13552,7 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
       // [reg +/- reg]
       AM.Scale = 1;
   } else if (N->getOpcode() == ISD::SUB) {
+    AM.HasBaseReg = true;
     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
     if (Offset)
       // [reg +/- imm]
@@ -12653,7 +13642,13 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
   // Check #2.
   if (!isLoad) {
     SDValue Val = cast<StoreSDNode>(N)->getValue();
-    if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
+
+    // Would require a copy.
+    if (Val == BasePtr)
+      return false;
+
+    // Would create a cycle.
+    if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
       return false;
   }
 
@@ -13190,7 +14185,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
   if (LD->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes.
-    SDValue BetterChain = FindBetterChain(N, Chain);
+    SDValue BetterChain = FindBetterChain(LD, Chain);
 
     // If there is a better chain.
     if (Chain != BetterChain) {
@@ -13378,7 +14373,7 @@ struct LoadedSlice {
 
   /// Get the alignment of the load used for this slice.
   unsigned getAlignment() const {
     unsigned Alignment = Origin->getAlignment();
-    unsigned Offset = getOffsetFromBase();
+    uint64_t Offset = getOffsetFromBase();
     if (Offset != 0)
       Alignment = MinAlign(Alignment, Alignment + Offset);
     return Alignment;
   }
@@ -13500,9 +14495,11 @@ struct LoadedSlice {
     assert(DAG && "Missing context");
     const TargetLowering &TLI = DAG->getTargetLoweringInfo();
     EVT ResVT = Use->getValueType(0);
-    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+    const TargetRegisterClass *ResRC =
+        TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
     const TargetRegisterClass *ArgRC =
-        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
+                           Use->getOperand(0)->isDivergent());
     if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
       return false;
@@ -13826,7 +14823,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
   if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
 
   // For narrowing to be valid, it must be the case that the load the
-  // immediately preceeding memory operation before the store.
+  // immediately preceding memory operation before the store.
   if (LD == Chain.getNode())
     ; // ok.
   else if (Chain->getOpcode() == ISD::TokenFactor &&
@@ -14039,11 +15036,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
 /// load / store operations if the target deems the transformation profitable.
 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
-  SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
   if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
-      Value.hasOneUse() &&
-      Chain == SDValue(Value.getNode(), 1)) {
+      Value.hasOneUse()) {
     LoadSDNode *LD = cast<LoadSDNode>(Value);
     EVT VT = LD->getMemoryVT();
     if (!VT.isFloatingPoint() ||
@@ -14073,7 +15068,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
                                 LD->getPointerInfo(), LDAlign);
 
     SDValue NewST =
-        DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
+        DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
                      ST->getPointerInfo(), STAlign);
 
     AddToWorklist(NewLD.getNode());
@@ -14171,14 +15166,14 @@ SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
     Visited.insert(StoreNodes[i].MemNode);
   }
 
-  // don't include nodes that are children
+  // don't include nodes that are children or repeated nodes.
   for (unsigned i = 0; i < NumStores; ++i) {
-    if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
+    if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
       Chains.push_back(StoreNodes[i].MemNode->getChain());
   }
 
   assert(Chains.size() > 0 && "Chain should have generated a chain");
-  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
+  return DAG.getTokenFactor(StoreDL, Chains);
 }
 
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
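The slice-alignment computation above uses MinAlign(Alignment, Alignment + Offset). A sketch modeled on llvm::MinAlign from Support/MathExtras.h (the lowest-set-bit trick; the helper below is a stand-in, not the LLVM declaration itself):

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both A and B.
    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B)); // isolate the lowest set bit
    }

    int main() {
      // A slice of a 16-byte-aligned load at byte offset 4 is only 4-aligned.
      assert(minAlign(16, 16 + 4) == 4);
      assert(minAlign(16, 16 + 8) == 8);
      assert(minAlign(16, 16 + 16) == 16);
    }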
@@ -14372,15 +15367,19 @@ void DAGCombiner::getStoreMergeCandidates(
     // Loads must only have one use.
     if (!Ld->hasNUsesOfValue(1, 0))
       return;
-    // The memory operands must not be volatile.
+    // The memory operands must not be volatile/indexed.
     if (Ld->isVolatile() || Ld->isIndexed())
       return;
   }
   auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                             int64_t &Offset) -> bool {
+    // The memory operands must not be volatile/indexed.
     if (Other->isVolatile() || Other->isIndexed())
       return false;
-    SDValue Val = peekThroughBitcasts(Other->getValue());
+    // Don't mix temporal stores with non-temporal stores.
+    if (St->isNonTemporal() != Other->isNonTemporal())
+      return false;
+    SDValue OtherBC = peekThroughBitcasts(Other->getValue());
     // Allow merging constants of different types as integers.
     bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
                                            : Other->getMemoryVT() != MemVT;
@@ -14388,16 +15387,19 @@ void DAGCombiner::getStoreMergeCandidates(
       if (NoTypeMatch)
         return false;
       // The Load's Base Ptr must also match
-      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
-        auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
+      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) {
+        BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
         if (LoadVT != OtherLd->getMemoryVT())
           return false;
         // Loads must only have one use.
         if (!OtherLd->hasNUsesOfValue(1, 0))
           return false;
-        // The memory operands must not be volatile.
+        // The memory operands must not be volatile/indexed.
         if (OtherLd->isVolatile() || OtherLd->isIndexed())
           return false;
+        // Don't mix temporal loads with non-temporal loads.
+        if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
+          return false;
         if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
           return false;
       } else
@@ -14406,17 +15408,17 @@ void DAGCombiner::getStoreMergeCandidates(
     if (IsConstantSrc) {
       if (NoTypeMatch)
         return false;
-      if (!(isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val)))
+      if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))
         return false;
     }
     if (IsExtractVecSrc) {
       // Do not merge truncated stores here.
       if (Other->isTruncatingStore())
         return false;
-      if (!MemVT.bitsEq(Val.getValueType()))
+      if (!MemVT.bitsEq(OtherBC.getValueType()))
         return false;
-      if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
-          Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+          OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
         return false;
     }
     Ptr = BaseIndexOffset::match(Other, DAG);
@@ -14441,9 +15443,11 @@ void DAGCombiner::getStoreMergeCandidates(
 
   RootNode = St->getChain().getNode();
 
+  unsigned NumNodesExplored = 0;
   if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
     RootNode = Ldn->getChain().getNode();
-    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
+         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
       if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
         for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
           if (I2.getOperandNo() == 0)
@@ -14454,7 +15458,8 @@ void DAGCombiner::getStoreMergeCandidates(
               StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
           }
   } else
-    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
+         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
      if (I.getOperandNo() == 0)
        if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
          BaseIndexOffset Ptr;
@@ -14551,6 +15556,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
                        isa<ConstantFPSDNode>(StoredVal);
   bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
                           StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
+  bool IsNonTemporalStore = St->isNonTemporal();
+  bool IsNonTemporalLoad =
+      IsLoadSrc && cast<LoadSDNode>(StoredVal)->isNonTemporal();
 
   if (!IsConstantSrc && !IsLoadSrc &&
       !IsExtractVecSrc)
     return false;
@@ -14652,8 +15660,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
 
       if (TLI.isTypeLegal(StoreTy) &&
           TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign, &IsFast) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast) {
        LastIntegerTrunc = false;
        LastLegalType = i + 1;
@@ -14664,8 +15672,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
             TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
         if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
             TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
-            TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                   FirstStoreAlign, &IsFast) &&
+            TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                   *FirstInChain->getMemOperand(),
+                                   &IsFast) &&
            IsFast) {
          LastIntegerTrunc = true;
          LastLegalType = i + 1;
@@ -14683,8 +15692,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
         EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
         if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
             TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
-            TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
-                                   FirstStoreAlign, &IsFast) &&
+            TLI.allowsMemoryAccess(
+                Context, DL, Ty, *FirstInChain->getMemOperand(), &IsFast) &&
            IsFast)
          LastLegalVectorType = i + 1;
       }
@@ -14755,8 +15764,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
 
       if (TLI.isTypeLegal(Ty) &&
           TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
-          TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
-                                 FirstStoreAlign, &IsFast) &&
+          TLI.allowsMemoryAccess(Context, DL, Ty,
+                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast)
        NumStoresToMerge = i + 1;
     }
@@ -14847,7 +15856,6 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
     unsigned FirstStoreAS = FirstInChain->getAddressSpace();
     unsigned FirstStoreAlign = FirstInChain->getAlignment();
     LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
-    unsigned FirstLoadAS = FirstLoad->getAddressSpace();
     unsigned FirstLoadAlign = FirstLoad->getAlignment();
 
     // Scan the memory operations on the chain and find the first
@@ -14887,11 +15895,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
       bool IsFastSt, IsFastLd;
       if (TLI.isTypeLegal(StoreTy) &&
           TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign, &IsFastSt) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
-                                 FirstLoadAlign, &IsFastLd) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalVectorType = i + 1;
       }
@@ -14901,11 +15909,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
       StoreTy = EVT::getIntegerVT(Context, SizeInBits);
       if (TLI.isTypeLegal(StoreTy) &&
           TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign, &IsFastSt) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
-                                 FirstLoadAlign, &IsFastLd) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalIntegerType = i + 1;
        DoIntegerTruncate = false;
@@ -14920,11 +15928,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
           TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
           TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign, &IsFastSt) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstInChain->getMemOperand(),
+                                 &IsFastSt) &&
          IsFastSt &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
-                                 FirstLoadAlign, &IsFastLd) &&
+          TLI.allowsMemoryAccess(Context, DL, StoreTy,
+                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalIntegerType = i + 1;
        DoIntegerTruncate = true;
@@ -14994,26 +16003,32 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
     SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
     AddToWorklist(NewStoreChain.getNode());
 
-    MachineMemOperand::Flags MMOFlags =
+    MachineMemOperand::Flags LdMMOFlags =
         isDereferenceable ? MachineMemOperand::MODereferenceable
                           : MachineMemOperand::MONone;
+    if (IsNonTemporalLoad)
+      LdMMOFlags |= MachineMemOperand::MONonTemporal;
+
+    MachineMemOperand::Flags StMMOFlags =
+        IsNonTemporalStore ? MachineMemOperand::MONonTemporal
+                           : MachineMemOperand::MONone;
 
     SDValue NewLoad, NewStore;
     if (UseVectorTy || !DoIntegerTruncate) {
       NewLoad =
           DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
                       FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
-                      FirstLoadAlign, MMOFlags);
+                      FirstLoadAlign, LdMMOFlags);
       NewStore = DAG.getStore(
           NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
-          FirstInChain->getPointerInfo(), FirstStoreAlign);
+          FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
     } else { // This must be the truncstore/extload case
       EVT ExtendedTy =
           TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
       NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
                                FirstLoad->getChain(), FirstLoad->getBasePtr(),
                                FirstLoad->getPointerInfo(), JointMemOpVT,
-                               FirstLoadAlign, MMOFlags);
+                               FirstLoadAlign, LdMMOFlags);
       NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
                                    FirstInChain->getBasePtr(),
                                    FirstInChain->getPointerInfo(),
@@ -15168,16 +16183,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // illegal type.
     if (((!LegalOperations && !ST->isVolatile()) ||
          TLI.isOperationLegal(ISD::STORE, SVT)) &&
-        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
-      unsigned OrigAlign = ST->getAlignment();
-      bool Fast = false;
-      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
-                                 ST->getAddressSpace(), OrigAlign, &Fast) &&
-          Fast) {
-        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
-                            ST->getPointerInfo(), OrigAlign,
-                            ST->getMemOperand()->getFlags(), ST->getAAInfo());
-      }
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+                                     DAG, *ST->getMemOperand())) {
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                          ST->getPointerInfo(), ST->getAlignment(),
+                          ST->getMemOperand()->getFlags(), ST->getAAInfo());
     }
   }
@@ -15205,6 +16215,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (SDValue NewST = TransformFPLoadStorePair(N))
     return NewST;
 
+  // Try transforming several stores into STORE (BSWAP).
+  if (SDValue Store = MatchStoreCombine(ST))
+    return Store;
+
   if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
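The MatchStoreCombine hook added above looks for byte-reversed store sequences that can become a single STORE(BSWAP). A scalar model of the idea (illustrative sketch; __builtin_bswap32 is the GCC/Clang intrinsic, and a little-endian host is assumed for the memcpy comparison):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t V = 0x11223344;
      uint8_t ByteStores[4], WordStore[4];
      // Four i8 stores laying down V's bytes at descending addresses...
      for (int i = 0; i != 4; ++i)
        ByteStores[3 - i] = uint8_t(V >> (8 * i));
      // ...equal one i32 store of bswap(V).
      uint32_t Swapped = __builtin_bswap32(V);
      std::memcpy(WordStore, &Swapped, 4);
      assert(std::memcmp(ByteStores, WordStore, 4) == 0);
    }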
@@ -15221,23 +16235,22 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       Value.getValueType().isInteger() &&
       (!isa<ConstantSDNode>(Value) ||
        !cast<ConstantSDNode>(Value)->isOpaque())) {
+    APInt TruncDemandedBits =
+        APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
+                             ST->getMemoryVT().getScalarSizeInBits());
+
     // See if we can simplify the input to this truncstore with knowledge that
     // only the low bits are being used.  For example:
     // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
-    SDValue Shorter = DAG.GetDemandedBits(
-        Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
-                                    ST->getMemoryVT().getScalarSizeInBits()));
+    SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits);
     AddToWorklist(Value.getNode());
-    if (Shorter.getNode())
-      return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
-                               Ptr, ST->getMemoryVT(), ST->getMemOperand());
+    if (Shorter)
+      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
+                               ST->getMemOperand());
 
     // Otherwise, see if we can simplify the operation with
     // SimplifyDemandedBits, which only works if the value has a single use.
-    if (SimplifyDemandedBits(
-            Value,
-            APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
-                                 ST->getMemoryVT().getScalarSizeInBits()))) {
+    if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
       // Re-visit the store if anything changed and the store hasn't been merged
       // with another node (N is deleted) SimplifyDemandedBits will add Value's
      // node back to the worklist if necessary, but we also need to re-visit
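The demanded-bits rewrite in the comment above ("truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8") holds because the shifted operand contributes nothing to the low byte. A quick scalar check (standalone sketch):

    #include <cassert>
    #include <cstdint>

    static uint8_t storedByteBefore(uint32_t X, uint32_t Y) {
      return static_cast<uint8_t>((X << 8) | Y); // only the low 8 bits survive
    }
    static uint8_t storedByteAfter(uint32_t /*X*/, uint32_t Y) {
      return static_cast<uint8_t>(Y); // (shl x, 8) has zero low byte, so drop it
    }

    int main() {
      assert(storedByteBefore(0xDEAD, 0xBE) == storedByteAfter(0xDEAD, 0xBE));
      assert(storedByteBefore(0xFFFF, 0x123) == storedByteAfter(0xFFFF, 0x123));
    }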
@@ -15263,25 +16276,55 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
     if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
-        !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
-        ST->getMemoryVT() == ST1->getMemoryVT()) {
-      // If this is a store followed by a store with the same value to the same
-      // location, then the store is dead/noop.
-      if (ST1->getValue() == Value) {
-        // The store is dead, remove it.
+        !ST1->isVolatile()) {
+      if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
+          ST->getMemoryVT() == ST1->getMemoryVT()) {
+        // If this is a store followed by a store with the same value to the
+        // same location, then the store is dead/noop.
         return Chain;
       }
 
-      // If this is a store who's preceeding store to the same location
-      // and no one other node is chained to that store we can effectively
-      // drop the store. Do not remove stores to undef as they may be used as
-      // data sinks.
       if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
           !ST1->getBasePtr().isUndef()) {
-        // ST1 is fully overwritten and can be elided. Combine with it's chain
-        // value.
-        CombineTo(ST1, ST1->getChain());
-        return SDValue();
+        const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
+        const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
+        unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
+        unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits();
+        // If this is a store whose preceding store writes to a subset of the
+        // current location and no other node is chained to that store, we can
+        // effectively drop the store. Do not remove stores to undef as they
+        // may be used as data sinks.
+        if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
+          CombineTo(ST1, ST1->getChain());
+          return SDValue();
+        }
+
+        // If ST stores to a subset of preceding store's write set, we may be
+        // able to fold ST's value into the preceding stored value. As we know
+        // the other uses of ST1's chain are unconcerned with ST, this folding
+        // will not affect those nodes.
+        int64_t BitOffset;
+        if (ChainBase.contains(DAG, ChainBitSize, STBase, STBitSize,
+                               BitOffset)) {
+          SDValue ChainValue = ST1->getValue();
+          if (auto *C1 = dyn_cast<ConstantSDNode>(ChainValue)) {
+            if (auto *C = dyn_cast<ConstantSDNode>(Value)) {
+              APInt Val = C1->getAPIntValue();
+              APInt InsertVal = C->getAPIntValue().zextOrTrunc(STBitSize);
+              // FIXME: Handle Big-endian mode.
+              if (!DAG.getDataLayout().isBigEndian()) {
+                Val.insertBits(InsertVal, BitOffset);
+                SDValue NewSDVal =
+                    DAG.getConstant(Val, SDLoc(C), ChainValue.getValueType(),
+                                    C1->isTargetOpcode(), C1->isOpaque());
+                SDNode *NewST1 = DAG.UpdateNodeOperands(
+                    ST1, ST1->getChain(), NewSDVal, ST1->getOperand(2),
+                    ST1->getOperand(3));
+                return CombineTo(ST, SDValue(NewST1, 0));
+              }
+            }
+          }
+        } // End ST subset of ST1 case.
       }
     }
   }
@@ -15299,7 +16342,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   // Always perform this optimization before types are legal. If the target
   // prefers, also try this after legalization to catch stores that were created
   // by intrinsics or other nodes.
-  if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) {
+  if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
     while (true) {
       // There can be multiple store sequences on the same chain.
       // Keep trying to merge store sequences until we are unable to do so
@@ -15333,6 +16376,54 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   return ReduceLoadOpStoreWidth(N);
 }
 
+SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
+  const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
+  if (!LifetimeEnd->hasOffset())
+    return SDValue();
+
+  const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
+                                        LifetimeEnd->getOffset(), false);
+
+  // We walk up the chains to find stores.
+  SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
+  while (!Chains.empty()) {
+    SDValue Chain = Chains.back();
+    Chains.pop_back();
+    if (!Chain.hasOneUse())
+      continue;
+    switch (Chain.getOpcode()) {
+    case ISD::TokenFactor:
+      for (unsigned Nops = Chain.getNumOperands(); Nops;)
+        Chains.push_back(Chain.getOperand(--Nops));
+      break;
+    case ISD::LIFETIME_START:
+    case ISD::LIFETIME_END:
+      // We can forward past any lifetime start/end that can be proven not to
+      // alias the node.
+      if (!isAlias(Chain.getNode(), N))
+        Chains.push_back(Chain.getOperand(0));
+      break;
+    case ISD::STORE: {
+      StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
+      if (ST->isVolatile() || ST->isIndexed())
+        continue;
+      const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
+      // If we store purely within object bounds just before its lifetime ends,
+      // we can remove the store.
+      if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
+                                   ST->getMemoryVT().getStoreSizeInBits())) {
+        LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
+                   dbgs() << "\nwithin LIFETIME_END of : ";
+                   LifetimeEndBase.dump(); dbgs() << "\n");
+        CombineTo(ST, ST->getChain());
+        return SDValue(N, 0);
+      }
+    }
+    }
+  }
+  return SDValue();
+}
+
 /// For the instruction sequence of store below, F and I values
 /// are bundled together as an i64 value before being stored into memory.
 /// Sometimes it is more efficient to generate separate stores for F and I,
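The overlapping-store constant fold in the first hunk of this group uses APInt::insertBits to splice the narrow value into the wide one. A concrete little-endian model (illustrative sketch; the bit arithmetic is a plain-integer stand-in for APInt, and a little-endian host is assumed, matching the FIXME's big-endian exclusion):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // st i32 0x11223344 -> p ; st i8 0xAA -> p+1  becomes one i32 store
      // with byte 1 replaced (BitOffset = 8).
      uint32_t Wide = 0x11223344;
      uint8_t Narrow = 0xAA;
      uint32_t Folded = (Wide & ~(uint32_t(0xFF) << 8)) | (uint32_t(Narrow) << 8);

      unsigned char Mem[4];
      std::memcpy(Mem, &Wide, 4);
      std::memcpy(Mem + 1, &Narrow, 1); // the later, narrower store
      uint32_t Result;
      std::memcpy(&Result, Mem, 4);
      assert(Result == Folded); // 0x1122AA44
    }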
@@ -15616,7 +16707,9 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
     Offset = DAG.getNode(
         ISD::MUL, DL, PtrType, Offset,
         DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
-    MPI = OriginalLoad->getPointerInfo();
+    // Discard the pointer info except the address space because the memory
+    // operand can't represent this new access since the offset is variable.
+    MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
   }
   NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset);
@@ -15668,14 +16761,15 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
 /// the math/logic after an extract element of a vector.
 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
                                        bool LegalOperations) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Vec = ExtElt->getOperand(0);
   SDValue Index = ExtElt->getOperand(1);
   auto *IndexC = dyn_cast<ConstantSDNode>(Index);
-  if (!IndexC || !ISD::isBinaryOp(Vec.getNode()) || !Vec.hasOneUse())
+  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+      Vec.getNode()->getNumValues() != 1)
     return SDValue();
 
   // Targets may want to avoid this to prevent an expensive register transfer.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.shouldScalarizeBinop(Vec))
     return SDValue();
@@ -16073,7 +17167,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
-                                           unsigned LeftIdx) {
+                                           unsigned LeftIdx, bool DidSplitVec) {
   MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
   SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
@@ -16081,17 +17175,12 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
-  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
 
-  // In case both the input vectors are extracted from same base
-  // vector we do not need extra addend (Vec2Offset) while
-  // computing shuffle mask.
-  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
-    Vec2Offset = InVT1.getVectorNumElements();
+  // If we artificially split a vector in two already, then the offsets in the
+  // operands will all be based off of VecIn1, even those in VecIn2.
+  unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
@@ -16214,23 +17303,29 @@ static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
   // The build vector contains some number of undef elements and exactly
   // one other element. That other element must be a zero-extended scalar
   // extracted from a vector at a constant index to turn this into a shuffle.
+  // Also, require that the build vector does not implicitly truncate/extend
+  // its elements.
   // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
+  EVT VT = BV->getValueType(0);
   SDValue Zext = BV->getOperand(ZextElt);
   if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
       Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-      !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)))
+      !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
+      Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
     return SDValue();
 
-  // The zero-extend must be a multiple of the source size.
+  // The zero-extend must be a multiple of the source size, and we must be
+  // building a vector of the same size as the source of the extract element.
   SDValue Extract = Zext.getOperand(0);
   unsigned DestSize = Zext.getValueSizeInBits();
   unsigned SrcSize = Extract.getValueSizeInBits();
-  if (DestSize % SrcSize != 0)
+  if (DestSize % SrcSize != 0 ||
+      Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
     return SDValue();
 
   // Create a shuffle mask that will combine the extracted element with zeros
   // and undefs.
-  int ZextRatio = DestSize / SrcSize;
+  int ZextRatio = DestSize / SrcSize;
   int NumMaskElts = NumBVOps * ZextRatio;
   SmallVector<int, 32> ShufMask(NumMaskElts, -1);
   for (int i = 0; i != NumMaskElts; ++i) {
@@ -16260,7 +17355,7 @@ static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
   SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
   SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec,
                                       ShufMask);
-  return DAG.getBitcast(BV->getValueType(0), Shuf);
+  return DAG.getBitcast(VT, Shuf);
 }
 
 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
@@ -16316,7 +17411,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
       return SDValue();
 
     SDValue ExtractedFromVec = Op.getOperand(0);
-    APInt ExtractIdx = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
     if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
       return SDValue();
@@ -16344,6 +17439,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   // vector, then split the vector efficiently based on the maximum
   // vector access index and adjust the VectorMask and
   // VecIn accordingly.
+  bool DidSplitVec = false;
   if (VecIn.size() == 2) {
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
@@ -16374,6 +17470,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
       VecIn.pop_back();
       VecIn.push_back(VecIn1);
       VecIn.push_back(VecIn2);
+      DidSplitVec = true;
 
       for (unsigned i = 0; i < NumElems; i++) {
         if (VectorMask[i] <= 0)
@@ -16411,7 +17508,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
 
     if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
-                                                VecRight, LeftIdx))
+                                                VecRight, LeftIdx, DidSplitVec))
       Shuffles.push_back(Shuffle);
     else
       return SDValue();
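Why reduceBuildVecToShuffleWithZero can represent a zero extension as a shuffle with a zero vector: on a little-endian target, widening each lane just interleaves the source bytes with zero bytes. A byte-level sketch (little-endian host assumed):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint8_t Src[2] = {0xAB, 0xCD};
      uint32_t Zext[2] = {Src[0], Src[1]};  // element-wise zero extension
      // The equivalent byte shuffle of the source with a zero vector:
      uint8_t Shuffled[8] = {Src[0], 0, 0, 0, Src[1], 0, 0, 0};
      assert(std::memcmp(Zext, Shuffled, 8) == 0);
    }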
@@ -16477,18 +17574,20 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
 
 // Try to turn a build vector of zero extends of extract vector elts into a
 // vector zero extend and possibly an extract subvector.
-// TODO: Support sign extend or any extend?
+// TODO: Support sign extend?
 // TODO: Allow undef elements?
-// TODO: Don't require the extracts to start at element 0.
 SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
   if (LegalOperations)
     return SDValue();
 
   EVT VT = N->getValueType(0);
+  bool FoundZeroExtend = false;
   SDValue Op0 = N->getOperand(0);
   auto checkElem = [&](SDValue Op) -> int64_t {
-    if (Op.getOpcode() == ISD::ZERO_EXTEND &&
+    unsigned Opc = Op.getOpcode();
+    FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
+    if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
         Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
         Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
       if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
@@ -16520,7 +17619,8 @@ SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
   SDLoc DL(N);
   In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
                    Op0.getOperand(0).getOperand(1));
-  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In);
+  return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
+                     VT, In);
 }
 
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
@@ -16885,14 +17985,14 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
       return SDValue();
     }
 
-    unsigned IdentityIndex = i * PartNumElem;
-    ConstantSDNode *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    auto *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     // The extract index must be constant.
     if (!CS)
       return SDValue();
 
     // Check that we are reading from the identity index.
-    if (CS->getZExtValue() != IdentityIndex)
+    unsigned IdentityIndex = i * PartNumElem;
+    if (CS->getAPIntValue() != IdentityIndex)
       return SDValue();
   }
@@ -16902,12 +18002,59 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     return SDValue();
 }
 
+static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
+                                              SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue BinOp = Extract->getOperand(0);
+  unsigned BinOpcode = BinOp.getOpcode();
+  if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1)
+    return SDValue();
+
+  SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
+  SDValue Index = Extract->getOperand(1);
+  EVT VT = Extract->getValueType(0);
+
+  // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
+  // if the source subvector is the same type as the one being extracted.
+  auto GetSubVector = [VT, Index](SDValue V) -> SDValue {
+    if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V.getOperand(1).getValueType() == VT && V.getOperand(2) == Index) {
+      return V.getOperand(1);
+    }
+    auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+    if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
+        V.getOperand(0).getValueType() == VT &&
+        (IndexC->getZExtValue() % VT.getVectorNumElements()) == 0) {
+      uint64_t SubIdx = IndexC->getZExtValue() / VT.getVectorNumElements();
+      return V.getOperand(SubIdx);
+    }
+    return SDValue();
+  };
+  SDValue Sub0 = GetSubVector(Bop0);
+  SDValue Sub1 = GetSubVector(Bop1);
+
+  // TODO: We could handle the case where only 1 operand is being inserted by
+  //       creating an extract of the other operand, but that requires checking
+  //       number of uses and/or costs.
+  if (!Sub0 || !Sub1 || !TLI.isOperationLegalOrCustom(BinOpcode, VT))
+    return SDValue();
+
+  // We are inserting both operands of the wide binop only to extract back
+  // to the narrow vector size. Eliminate all of the insert/extract:
+  // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
+  return DAG.getNode(BinOpcode, SDLoc(Extract), VT, Sub0, Sub1,
+                     BinOp->getFlags());
+}
+
 /// If we are extracting a subvector produced by a wide binary operator try
 /// to use a narrow binary operator and/or avoid concatenation and extraction.
 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
   // some of these bailouts with other transforms.
 
+  if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG))
+    return V;
+
   // The extract index must be a constant, so we can map it to a concat operand.
   auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
   if (!ExtractIndexC)
@@ -16915,8 +18062,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
 
   // We are looking for an optionally bitcasted wide vector binary operator
   // feeding an extract subvector.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
-  if (!ISD::isBinaryOp(BinOp.getNode()))
+  unsigned BOpcode = BinOp.getOpcode();
+  if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1)
     return SDValue();
 
   // The binop must be a vector type, so we can extract some fraction of it.
@@ -16945,8 +18094,6 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // Bail out if the target does not support a narrower version of the binop.
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
                                    WideNumElts / NarrowingRatio);
-  unsigned BOpcode = BinOp.getOpcode();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
     return SDValue();
@@ -16986,35 +18133,35 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
 
   // We need at least one concatenation operation of a binop operand to make
   // this transform worthwhile. The concat must double the input vector sizes.
-  // TODO: Should we also handle INSERT_SUBVECTOR patterns?
-  SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
-  SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
-  bool ConcatL =
-      LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2;
-  bool ConcatR =
-      RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2;
-  if (!ConcatL && !ConcatR)
+  auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
+    if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
+      return V.getOperand(ConcatOpNum);
     return SDValue();
+  };
+  SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
+  SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));
+
+  if (SubVecL || SubVecR) {
+    // If a binop operand was not the result of a concat, we must extract a
+    // half-sized operand for our new narrow binop:
+    // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
+    // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
+    // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
+    SDLoc DL(Extract);
+    SDValue IndexC = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
+    SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
+                        : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                                      BinOp.getOperand(0), IndexC);
 
-  // If one of the binop operands was not the result of a concat, we must
-  // extract a half-sized operand for our new narrow binop.
-  SDLoc DL(Extract);
-
-  // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
-  // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
-  // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
-  SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum))
-                      : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                                    BinOp.getOperand(0),
-                                    DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+    SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
+                        : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                                      BinOp.getOperand(1), IndexC);
 
-  SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum))
-                      : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                                    BinOp.getOperand(1),
-                                    DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+    SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
+    return DAG.getBitcast(VT, NarrowBinOp);
+  }
 
-  SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
-  return DAG.getBitcast(VT, NarrowBinOp);
+  return SDValue();
 }
 
 /// If we are extracting a subvector from a wide vector load, convert to a
@@ -17052,7 +18199,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   return NewLd;
 }
 
-SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
+SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
@@ -17064,14 +18211,51 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
   if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
     return NarrowLoad;
 
+  // Combine an extract of an extract into a single extract_subvector.
+  // ext (ext X, C), 0 --> ext X, C
+  SDValue Index = N->getOperand(1);
+  if (isNullConstant(Index) && V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      V.hasOneUse() && isa<ConstantSDNode>(V.getOperand(1))) {
+    if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
+                                    V.getConstantOperandVal(1)) &&
+        TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
+                         V.getOperand(1));
+    }
+  }
+
+  // Try to move vector bitcast after extract_subv by scaling extraction index:
+  // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+  if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST &&
+      V.getOperand(0).getValueType().isVector()) {
+    SDValue SrcOp = V.getOperand(0);
+    EVT SrcVT = SrcOp.getValueType();
+    unsigned SrcNumElts = SrcVT.getVectorNumElements();
+    unsigned DestNumElts = V.getValueType().getVectorNumElements();
+    if ((SrcNumElts % DestNumElts) == 0) {
+      unsigned SrcDestRatio = SrcNumElts / DestNumElts;
+      unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio;
+      EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+                                      NewExtNumElts);
+      if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+        unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio;
+        SDLoc DL(N);
+        SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+        SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+                                         V.getOperand(0), NewIndex);
+        return DAG.getBitcast(NVT, NewExtract);
+      }
+    }
+    // TODO - handle (DestNumElts % SrcNumElts) == 0
+  }
+
   // Combine:
   //    (extract_subvec (concat V1, V2, ...), i)
   // Into:
   //    Vi if possible
   // Only operand 0 is checked as 'concat' assumes all inputs of the same
   // type.
-  if (V.getOpcode() == ISD::CONCAT_VECTORS &&
-      isa<ConstantSDNode>(N->getOperand(1)) &&
+  if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index) &&
       V.getOperand(0).getValueType() == NVT) {
     unsigned Idx = N->getConstantOperandVal(1);
     unsigned NumElems = NVT.getVectorNumElements();
@@ -17084,7 +18268,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
 
   // If the input is a build vector. Try to make a smaller build vector.
   if (V.getOpcode() == ISD::BUILD_VECTOR) {
-    if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    if (auto *IdxC = dyn_cast<ConstantSDNode>(Index)) {
       EVT InVT = V.getValueType();
       unsigned ExtractSize = NVT.getSizeInBits();
       unsigned EltSize = InVT.getScalarSizeInBits();
@@ -17092,26 +18276,27 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
       if (ExtractSize % EltSize == 0) {
         unsigned NumElems = ExtractSize / EltSize;
         EVT EltVT = InVT.getVectorElementType();
-        EVT ExtractVT = NumElems == 1 ? EltVT :
-          EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
+        EVT ExtractVT = NumElems == 1 ? EltVT
+                                      : EVT::getVectorVT(*DAG.getContext(),
+                                                         EltVT, NumElems);
         if ((Level < AfterLegalizeDAG ||
              (NumElems == 1 ||
               TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
             (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
-          unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
-                            EltSize;
+          unsigned IdxVal = IdxC->getZExtValue();
+          IdxVal *= NVT.getScalarSizeInBits();
+          IdxVal /= EltSize;
+
           if (NumElems == 1) {
             SDValue Src = V->getOperand(IdxVal);
             if (EltVT != Src.getValueType())
               Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
-
             return DAG.getBitcast(NVT, Src);
           }
 
           // Extract the pieces from the original build_vector.
- SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), - makeArrayRef(V->op_begin() + IdxVal, - NumElems)); + SDValue BuildVec = DAG.getBuildVector( + ExtractVT, SDLoc(N), V->ops().slice(IdxVal, NumElems)); return DAG.getBitcast(NVT, BuildVec); } } @@ -17126,9 +18311,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { return SDValue(); // Only handle cases where both indexes are constants. - auto *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1)); + auto *ExtIdx = dyn_cast<ConstantSDNode>(Index); auto *InsIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); - if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) @@ -17141,7 +18325,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { return DAG.getNode( ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), - N->getOperand(1)); + Index); } } @@ -17154,6 +18338,53 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { return SDValue(); } +/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles +/// followed by concatenation. Narrow vector ops may have better performance +/// than wide ops, and this can unlock further narrowing of other vector ops. +/// Targets can invert this transform later if it is not profitable. +static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf, + SelectionDAG &DAG) { + SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1); + if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || + N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 || + !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) + return SDValue(); + + // Split the wide shuffle mask into halves. Any mask element that is accessing + // operand 1 is offset down to account for narrowing of the vectors. + ArrayRef<int> Mask = Shuf->getMask(); + EVT VT = Shuf->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + SmallVector<int, 16> Mask0(HalfNumElts, -1); + SmallVector<int, 16> Mask1(HalfNumElts, -1); + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] == -1) + continue; + int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts; + if (i < HalfNumElts) + Mask0[i] = M; + else + Mask1[i - HalfNumElts] = M; + } + + // Ask the target if this is a valid transform. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + HalfNumElts); + if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) || + !TLI.isShuffleMaskLegal(Mask1, HalfVT)) + return SDValue(); + + // shuffle (concat X, undef), (concat Y, undef), Mask --> + // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) + SDValue X = N0.getOperand(0), Y = N1.getOperand(0); + SDLoc DL(Shuf); + SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0); + SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1); +} + // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, // or turn a shuffle of a single concat into simpler shuffle then concat. 
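// (Editor's sketch: the mask adjustment performed by foldShuffleOfConcatUndefs
// above, restated as stand-alone C++ so the offset arithmetic can be checked in
// isolation; the helper name and driver values are hypothetical, not part of
// this patch.
//
//   int adjustMaskElt(int M, int NumElts) {
//     // Elements taken from operand 1 (M >= NumElts) shift down by half a
//     // vector, because the narrow shuffle sees Y right after X instead of
//     // after X's undef upper half.
//     return M < 0 ? -1 : (M < NumElts ? M : M - NumElts / 2);
//   }
//
// With NumElts = 8, M = 9 selects lane 1 of Y and becomes 5, which is lane 1
// of Y in the narrow 4+4 shuffle of X and Y.)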
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { @@ -17163,20 +18394,24 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + ArrayRef<int> Mask = SVN->getMask(); SmallVector<SDValue, 4> Ops; EVT ConcatVT = N0.getOperand(0).getValueType(); unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; + auto IsUndefMaskElt = [](int i) { return i == -1; }; + // Special case: shuffle(concat(A,B)) can be more efficiently represented // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high // half vector elements. if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() && - std::all_of(SVN->getMask().begin() + NumElemsPerConcat, - SVN->getMask().end(), [](int i) { return i == -1; })) { - N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), - makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); + llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat), + IsUndefMaskElt)) { + N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), + N0.getOperand(1), + Mask.slice(0, NumElemsPerConcat)); N1 = DAG.getUNDEF(ConcatVT); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); } @@ -17184,35 +18419,32 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { // Look at every vector that's inserted. We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { - // Make sure we're dealing with a copy. unsigned Begin = I * NumElemsPerConcat; - bool AllUndef = true, NoUndef = true; - for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) { - if (SVN->getMaskElt(J) >= 0) - AllUndef = false; - else - NoUndef = false; + ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat); + + // Make sure we're dealing with a copy. + if (llvm::all_of(SubMask, IsUndefMaskElt)) { + Ops.push_back(DAG.getUNDEF(ConcatVT)); + continue; } - if (NoUndef) { - if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0) + int OpIdx = -1; + for (int i = 0; i != (int)NumElemsPerConcat; ++i) { + if (IsUndefMaskElt(SubMask[i])) + continue; + if ((SubMask[i] % (int)NumElemsPerConcat) != i) return SDValue(); - - for (unsigned J = 1; J != NumElemsPerConcat; ++J) - if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J)) - return SDValue(); - - unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat; - if (FirstElt < N0.getNumOperands()) - Ops.push_back(N0.getOperand(FirstElt)); - else - Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands())); - - } else if (AllUndef) { - Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType())); - } else { // Mixed with general masks and undefs, can't do optimization. 
- return SDValue(); + int EltOpIdx = SubMask[i] / NumElemsPerConcat; + if (0 <= OpIdx && EltOpIdx != OpIdx) + return SDValue(); + OpIdx = EltOpIdx; } + assert(0 <= OpIdx && "Unknown concat_vectors op"); + + if (OpIdx < (int)N0.getNumOperands()) + Ops.push_back(N0.getOperand(OpIdx)); + else + Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands())); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); @@ -17278,8 +18510,8 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, if (S.getOpcode() == ISD::BUILD_VECTOR) { Op = S.getOperand(Idx); } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { - assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index."); - Op = S.getOperand(0); + SDValue Op0 = S.getOperand(0); + Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType()); } else { // Operand can't be combined - bail out. return SDValue(); @@ -17433,11 +18665,17 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, // If splat-mask contains undef elements, we need to be careful about // introducing undef's in the folded mask which are not the result of composing // the masks of the shuffles. -static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask, - ShuffleVectorSDNode *Splat, - SelectionDAG &DAG) { +static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf, + SelectionDAG &DAG) { + if (!Shuf->getOperand(1).isUndef()) + return SDValue(); + auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0)); + if (!Splat || !Splat->isSplat()) + return SDValue(); + + ArrayRef<int> ShufMask = Shuf->getMask(); ArrayRef<int> SplatMask = Splat->getMask(); - assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); + assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch"); // Prefer simplifying to the splat-shuffle, if possible. This is legal if // every undef mask element in the splat-shuffle has a corresponding undef @@ -17463,13 +18701,13 @@ static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask, return false; return true; }; - if (CanSimplifyToExistingSplat(UserMask, SplatMask)) - return SDValue(Splat, 0); + if (CanSimplifyToExistingSplat(ShufMask, SplatMask)) + return Shuf->getOperand(0); // Create a new shuffle with a mask that is composed of the two shuffles' // masks. SmallVector<int, 32> NewMask; - for (int Idx : UserMask) + for (int Idx : ShufMask) NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), @@ -17555,6 +18793,34 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, Op1, Op0.getOperand(1), NewInsIndex); } +/// If we have a unary shuffle of a shuffle, see if it can be folded away +/// completely. This has the potential to lose undef knowledge because the first +/// shuffle may not have an undef mask element where the second one does. So +/// only call this after doing simplifications based on demanded elements. +static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) { + // shuf (shuf0 X, Y, Mask0), undef, Mask + auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0)); + if (!Shuf0 || !Shuf->getOperand(1).isUndef()) + return SDValue(); + + ArrayRef<int> Mask = Shuf->getMask(); + ArrayRef<int> Mask0 = Shuf0->getMask(); + for (int i = 0, e = (int)Mask.size(); i != e; ++i) { + // Ignore undef elements. 
+ if (Mask[i] == -1) + continue; + assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value"); + + // Is the element of the shuffle operand chosen by this shuffle the same as + // the element chosen by the shuffle operand itself? + if (Mask0[Mask[i]] != Mask0[i]) + return SDValue(); + } + // Every element of this shuffle is identical to the result of the previous + // shuffle, so we can replace this value. + return Shuf->getOperand(0); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -17604,19 +18870,35 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) return InsElt; - // A shuffle of a single vector that is a splat can always be folded. - if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0)) - if (N1->isUndef() && N0Shuf->isSplat()) - return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); + // A shuffle of a single vector that is a splatted value can always be folded. + if (SDValue V = combineShuffleOfSplatVal(SVN, DAG)) + return V; // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { - SDNode *V = N0.getNode(); + int SplatIndex = SVN->getSplatIndex(); + if (TLI.isExtractVecEltCheap(VT, SplatIndex) && + TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) { + // splat (vector_bo L, R), Index --> + // splat (scalar_bo (extelt L, Index), (extelt R, Index)) + SDValue L = N0.getOperand(0), R = N0.getOperand(1); + SDLoc DL(N); + EVT EltVT = VT.getScalarType(); + SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL); + SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); + SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); + SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, + N0.getNode()->getFlags()); + SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); + SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0); + return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); + } // If this is a bit convert that changes the element type of the vector but // not the number of vector elements, look through it. Be careful not to // look though conversions that change things like v4f32 to v2f64. + SDNode *V = N0.getNode(); if (V->getOpcode() == ISD::BITCAST) { SDValue ConvInput = V->getOperand(0); if (ConvInput.getValueType().isVector() && @@ -17649,7 +18931,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return N0; // Canonicalize any other splat as a build_vector. - const SDValue &Splatted = V->getOperand(SVN->getSplatIndex()); + SDValue Splatted = V->getOperand(SplatIndex); SmallVector<SDValue, 8> Ops(NumElts, Splatted); SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); @@ -17665,6 +18947,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); + // This is intentionally placed after demanded elements simplification because + // it could eliminate knowledge of undef elements created by this shuffle. + if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) + return ShufOp; + // Match shuffles that can be converted to any_vector_extend_in_reg. if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) return V; @@ -17704,7 +18991,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { NewMask.push_back(M < 0 ? 
-1 : Scale * M + s);
return NewMask;
};
-
+
SDValue BC0 = peekThroughOneUseBitcasts(N0);
if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
EVT SVT = VT.getScalarType();
@@ -17884,6 +19171,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
}
+ if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
+ return V;
+
return SDValue();
}
@@ -18006,7 +19296,44 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
- unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+ uint64_t InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+ // Push subvector bitcasts to the output, adjusting the index as we go.
+ // insert_subvector(bitcast(v), bitcast(s), c1)
+ // -> bitcast(insert_subvector(v, s, c2))
+ if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
+ N1.getOpcode() == ISD::BITCAST) {
+ SDValue N0Src = peekThroughBitcasts(N0);
+ SDValue N1Src = peekThroughBitcasts(N1);
+ EVT N0SrcSVT = N0Src.getValueType().getScalarType();
+ EVT N1SrcSVT = N1Src.getValueType().getScalarType();
+ if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
+ N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
+ EVT NewVT;
+ SDLoc DL(N);
+ SDValue NewIdx;
+ MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
+ unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
+ NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
+ NewIdx = DAG.getConstant(InsIdx * Scale, DL, IdxVT);
+ } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
+ unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
+ if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
+ NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
+ NewIdx = DAG.getConstant(InsIdx / Scale, DL, IdxVT);
+ }
+ }
+ if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
+ SDValue Res = DAG.getBitcast(NewVT, N0Src);
+ Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
// Canonicalize insert_subvector dag nodes.
// Example:
@@ -18070,6 +19397,36 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType();
+ unsigned Opcode = N->getOpcode();
+
+ // VECREDUCE over 1-element vector is just an extract.
+ if (VT.getVectorNumElements() == 1) {
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ if (Res.getValueType() != N->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
+ return Res;
+ }
+
+ // On a boolean vector an and/or reduction is the same as a umin/umax
+ // reduction. Convert them if the latter is legal while the former isn't.
+ if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
+ unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
+ ?
ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX; + if (!TLI.isOperationLegalOrCustom(Opcode, VT) && + TLI.isOperationLegalOrCustom(NewOpcode, VT) && + DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits()) + return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0); + } + + return SDValue(); +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> @@ -18161,6 +19518,53 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { return SDValue(); } +/// If a vector binop is performed on splat values, it may be profitable to +/// extract, scalarize, and insert/splat. +static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned Opcode = N->getOpcode(); + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // TODO: Remove/replace the extract cost check? If the elements are available + // as scalars, then there may be no extract cost. Should we ask if + // inserting a scalar back into a vector is cheap instead? + int Index0, Index1; + SDValue Src0 = DAG.getSplatSourceVector(N0, Index0); + SDValue Src1 = DAG.getSplatSourceVector(N1, Index1); + if (!Src0 || !Src1 || Index0 != Index1 || + Src0.getValueType().getVectorElementType() != EltVT || + Src1.getValueType().getVectorElementType() != EltVT || + !TLI.isExtractVecEltCheap(VT, Index0) || + !TLI.isOperationLegalOrCustom(Opcode, EltVT)) + return SDValue(); + + SDLoc DL(N); + SDValue IndexC = + DAG.getConstant(Index0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())); + SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N0, IndexC); + SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N1, IndexC); + SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); + + // If all lanes but 1 are undefined, no need to splat the scalar result. + // TODO: Keep track of undefs and use that info in the general case. + if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() && + count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 && + count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) { + // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> + // build_vec ..undef, (bo X, Y), undef... + SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT)); + Ops[Index0] = ScalarBO; + return DAG.getBuildVector(VT, DL, Ops); + } + + // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index + SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO); + return DAG.getBuildVector(VT, DL, Ops); +} + /// Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { assert(N->getValueType(0).isVector() && @@ -18169,34 +19573,63 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Ops[] = {LHS, RHS}; + EVT VT = N->getValueType(0); + unsigned Opcode = N->getOpcode(); // See if we can constant fold the vector operation. if (SDValue Fold = DAG.FoldConstantVectorArithmetic( - N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) + Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) return Fold; - // Type legalization might introduce new shuffles in the DAG. 
- // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
- // -> (shuffle (VBinOp (A, B)), Undef, Mask).
- if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
- isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
- LHS.getOperand(1).isUndef() &&
- RHS.getOperand(1).isUndef()) {
- ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
- ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
-
- if (SVN0->getMask().equals(SVN1->getMask())) {
- EVT VT = N->getValueType(0);
- SDValue UndefVector = LHS.getOperand(1);
- SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
- LHS.getOperand(0), RHS.getOperand(0),
- N->getFlags());
- AddUsersToWorklist(N);
- return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
- SVN0->getMask());
+ // Move unary shuffles with identical masks after a vector binop:
+ // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask)
+ // --> shuffle (VBinOp A, B), Undef, Mask
+ // This does not require type legality checks because we are creating the
+ // same types of operations that are in the original sequence. We do have to
+ // restrict ops like integer div that have immediate UB (e.g., div-by-zero)
+ // though. This code is adapted from the identical transform in instcombine.
+ if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
+ Opcode != ISD::UREM && Opcode != ISD::SREM &&
+ Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
+ auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
+ auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
+ if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
+ LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
+ (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
+ SDLoc DL(N);
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
+ RHS.getOperand(0), N->getFlags());
+ SDValue UndefV = LHS.getOperand(1);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
+ }
+ }
+
+ // The following pattern is likely to emerge with vector reduction ops. Moving
+ // the binary operation ahead of insertion may allow using a narrower vector
+ // instruction that has better performance than the wide version of the op:
+ // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
+ if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+ RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+ LHS.getOperand(2) == RHS.getOperand(2) &&
+ (LHS.hasOneUse() || RHS.hasOneUse())) {
+ SDValue X = LHS.getOperand(1);
+ SDValue Y = RHS.getOperand(1);
+ SDValue Z = LHS.getOperand(2);
+ EVT NarrowVT = X.getValueType();
+ if (NarrowVT == Y.getValueType() &&
+ TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
+ // (binop undef, undef) may not return undef, so compute that result.
+ SDLoc DL(N);
+ SDValue VecC =
+ DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
+ SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
}
}
+ if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
+ return V;
+
return SDValue();
}
@@ -18214,13 +19647,16 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
// Check to see if we got a select_cc back (to turn into setcc/select).
// Otherwise, just return whatever node we got back, like fabs.
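// (Editor's aside, illustrative only: a select_cc node fuses the compare and
// the select, (select_cc lhs, rhs, t, f, cc). The code below decomposes it
// into the two-node form, now propagating the source node's flags to both
// pieces:
//   %c = (setcc lhs, rhs, cc)
//   (select %c, t, f).)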
if (SCC.getOpcode() == ISD::SELECT_CC) { + const SDNodeFlags Flags = N0.getNode()->getFlags(); SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), - SCC.getOperand(4)); + SCC.getOperand(4), Flags); AddToWorklist(SETCC.getNode()); - return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, - SCC.getOperand(2), SCC.getOperand(3)); + SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, + SCC.getOperand(2), SCC.getOperand(3)); + SelectNode->setFlags(Flags); + return SelectNode; } return SCC; @@ -18305,6 +19741,10 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, // locations are not in the default address space. LLD->getPointerInfo().getAddrSpace() != 0 || RLD->getPointerInfo().getAddrSpace() != 0 || + // We can't produce a CMOV of a TargetFrameIndex since we won't + // generate the address generation required. + LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex || + RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex || !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), LLD->getBasePtr().getValueType())) return false; @@ -18501,8 +19941,8 @@ SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset( // If a constant can be materialized without loads, this does not make sense. if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal || - TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) || - TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) + TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) || + TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize)) return SDValue(); // If both constants have multiple uses, then we won't need to do an extra @@ -18547,20 +19987,20 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, if (N2 == N3) return N2; EVT CmpOpVT = N0.getValueType(); + EVT CmpResVT = getSetCCResultType(CmpOpVT); EVT VT = N2.getValueType(); auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode()); auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode()); // Determine if the condition we're dealing with is constant. - SDValue SCC = SimplifySetCC(getSetCCResultType(CmpOpVT), N0, N1, CC, DL, - false); - if (SCC.getNode()) AddToWorklist(SCC.getNode()); - - if (auto *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) { - // fold select_cc true, x, y -> x - // fold select_cc false, x, y -> y - return !SCCC->isNullValue() ? N2 : N3; + if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) { + AddToWorklist(SCC.getNode()); + if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) { + // fold select_cc true, x, y -> x + // fold select_cc false, x, y -> y + return !(SCCC->isNullValue()) ? N2 : N3; + } } if (SDValue V = @@ -18621,7 +20061,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue Temp, SCC; // zext (setcc n0, n1) if (LegalTypes) { - SCC = DAG.getSetCC(DL, getSetCCResultType(CmpOpVT), N0, N1, CC); + SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC); if (VT.bitsLT(SCC.getValueType())) Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT); else @@ -18644,36 +20084,6 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, getShiftAmountTy(Temp.getValueType()))); } - // Check to see if this is an integer abs. 
- // select_cc setg[te] X, 0, X, -X -> - // select_cc setgt X, -1, X, -X -> - // select_cc setl[te] X, 0, -X, X -> - // select_cc setlt X, 1, -X, X -> - // Y = sra (X, size(X)-1); xor (add (X, Y), Y) - if (N1C) { - ConstantSDNode *SubC = nullptr; - if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || - (N1C->isAllOnesValue() && CC == ISD::SETGT)) && - N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) - SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0)); - else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || - (N1C->isOne() && CC == ISD::SETLT)) && - N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) - SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0)); - - if (SubC && SubC->isNullValue() && CmpOpVT.isInteger()) { - SDLoc DL(N0); - SDValue Shift = DAG.getNode(ISD::SRA, DL, CmpOpVT, N0, - DAG.getConstant(CmpOpVT.getSizeInBits() - 1, - DL, - getShiftAmountTy(CmpOpVT))); - SDValue Add = DAG.getNode(ISD::ADD, DL, CmpOpVT, N0, Shift); - AddToWorklist(Shift.getNode()); - AddToWorklist(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, CmpOpVT, Add, Shift); - } - } - // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) @@ -18728,7 +20138,7 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, SDValue DAGCombiner::BuildSDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. - if (DAG.getMachineFunction().getFunction().optForMinSize()) + if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); SmallVector<SDNode *, 8> Built; @@ -18769,7 +20179,7 @@ SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { SDValue DAGCombiner::BuildUDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. - if (DAG.getMachineFunction().getFunction().optForMinSize()) + if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); SmallVector<SDNode *, 8> Built; @@ -18821,7 +20231,6 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { AddToWorklist(Est.getNode()); if (Iterations) { - EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); @@ -18977,7 +20386,6 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, if (!Reciprocal) { // The estimate is now completely wrong if the input was exactly 0.0 or // possibly a denormal. Force the answer to 0.0 for those cases. - EVT VT = Op.getValueType(); SDLoc DL(Op); EVT CCVT = getSetCCResultType(VT); ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; @@ -19020,79 +20428,95 @@ SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { } /// Return true if there is any possibility that the two addresses overlap. -bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { - // If they are the same then they must be aliases. - if (Op0->getBasePtr() == Op1->getBasePtr()) return true; +bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const { - // If they are both volatile then they cannot be reordered. - if (Op0->isVolatile() && Op1->isVolatile()) return true; + struct MemUseCharacteristics { + bool IsVolatile; + SDValue BasePtr; + int64_t Offset; + Optional<int64_t> NumBytes; + MachineMemOperand *MMO; + }; - // If one operation reads from invariant memory, and the other may store, they - // cannot alias. 
These should really be checking the equivalent of mayWrite, - // but it only matters for memory nodes other than load /store. - if (Op0->isInvariant() && Op1->writeMem()) - return false; + auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics { + if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) { + int64_t Offset = 0; + if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset())) + Offset = (LSN->getAddressingMode() == ISD::PRE_INC) + ? C->getSExtValue() + : (LSN->getAddressingMode() == ISD::PRE_DEC) + ? -1 * C->getSExtValue() + : 0; + return {LSN->isVolatile(), LSN->getBasePtr(), Offset /*base offset*/, + Optional<int64_t>(LSN->getMemoryVT().getStoreSize()), + LSN->getMemOperand()}; + } + if (const auto *LN = cast<LifetimeSDNode>(N)) + return {false /*isVolatile*/, LN->getOperand(1), + (LN->hasOffset()) ? LN->getOffset() : 0, + (LN->hasOffset()) ? Optional<int64_t>(LN->getSize()) + : Optional<int64_t>(), + (MachineMemOperand *)nullptr}; + // Default. + return {false /*isvolatile*/, SDValue(), (int64_t)0 /*offset*/, + Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr}; + }; - if (Op1->isInvariant() && Op0->writeMem()) - return false; + MemUseCharacteristics MUC0 = getCharacteristics(Op0), + MUC1 = getCharacteristics(Op1); - unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize(); - unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize(); - - // Check for BaseIndexOffset matching. - BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG); - BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG); - int64_t PtrDiff; - if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) { - if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) - return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); - - // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be - // able to calculate their relative offset if at least one arises - // from an alloca. However, these allocas cannot overlap and we - // can infer there is no alias. - if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase())) - if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) { - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - // If the base are the same frame index but the we couldn't find a - // constant offset, (indices are different) be conservative. - if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || - !MFI.isFixedObjectIndex(B->getIndex()))) - return false; - } + // If they are to the same address, then they must be aliases. + if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr && + MUC0.Offset == MUC1.Offset) + return true; - bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase()); - bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase()); - bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase()); - bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase()); - bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase()); - bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase()); - - // If of mismatched base types or checkable indices we can check - // they do not alias. - if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) || - (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) && - (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) + // If they are both volatile then they cannot be reordered. 
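// (Editor's sketch, not part of the patch: the interval-disjointness test that
// the offset/size-based no-alias reasoning in this function boils down to,
// restated as stand-alone C++; the helper name is hypothetical.
//
//   // Accesses [Off0, Off0 + Size0) and [Off1, Off1 + Size1) off a common
//   // base can only alias if the half-open intervals intersect:
//   bool mayOverlap(int64_t Off0, int64_t Size0, int64_t Off1, int64_t Size1) {
//     return Off0 < Off1 + Size1 && Off1 < Off0 + Size0;
//   }
//
// Both the BaseIndexOffset::computeAliasing call and the alignment-based check
// below are conservative refinements of this test.)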
+ if (MUC0.IsVolatile && MUC1.IsVolatile)
+ return true;
+
+ if (MUC0.MMO && MUC1.MMO) {
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
return false;
}
+ // Try to prove that there is aliasing, or that there is no aliasing. Either
+ // way, we can return now. If nothing can be proved, proceed with more tests.
+ bool IsAlias;
+ if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
+ DAG, IsAlias))
+ return IsAlias;
+
+ // The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
+ // either is not known.
+ if (!MUC0.MMO || !MUC1.MMO)
+ return true;
+
+ // If one operation reads from invariant memory, and the other may store, they
+ // cannot alias. These should really be checking the equivalent of mayWrite,
+ // but it only matters for memory nodes other than load/store.
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
+ return false;
+
// If we know required SrcValue1 and SrcValue2 have relatively large
// alignment compared to the size and offset of the access, we may be able
// to prove they do not alias. This check is conservative for now to catch
// cases created by splitting vector types.
- int64_t SrcValOffset0 = Op0->getSrcValueOffset();
- int64_t SrcValOffset1 = Op1->getSrcValueOffset();
- unsigned OrigAlignment0 = Op0->getOriginalAlignment();
- unsigned OrigAlignment1 = Op1->getOriginalAlignment();
+ int64_t SrcValOffset0 = MUC0.MMO->getOffset();
+ int64_t SrcValOffset1 = MUC1.MMO->getOffset();
+ unsigned OrigAlignment0 = MUC0.MMO->getBaseAlignment();
+ unsigned OrigAlignment1 = MUC1.MMO->getBaseAlignment();
if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
- NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) {
+ MUC0.NumBytes.hasValue() && MUC1.NumBytes.hasValue() &&
+ *MUC0.NumBytes == *MUC1.NumBytes && OrigAlignment0 > *MUC0.NumBytes) {
int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
// There is no overlap between these relatively aligned accesses of
// similar size. Return no alias.
- if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
- (OffAlign1 + NumBytes1) <= OffAlign0)
+ if ((OffAlign0 + *MUC0.NumBytes) <= OffAlign1 ||
+ (OffAlign1 + *MUC1.NumBytes) <= OffAlign0)
return false;
}
@@ -19105,17 +20529,16 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
UseAA = false;
#endif
- if (UseAA && AA &&
- Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
+ if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
- int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
- int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
- AliasResult AAResult =
- AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
- UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
- MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
- UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
+ int64_t Overlap0 = *MUC0.NumBytes + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = *MUC1.NumBytes + SrcValOffset1 - MinOffset;
+ AliasResult AAResult = AA->alias(
+ MemoryLocation(MUC0.MMO->getValue(), Overlap0,
+ UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
+ MemoryLocation(MUC1.MMO->getValue(), Overlap1,
+ UseTBAA ?
MUC1.MMO->getAAInfo() : AAMDNodes()));
if (AAResult == NoAlias)
return false;
}
@@ -19132,18 +20555,64 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
// Get alias information for node.
- bool IsLoad = isa<LoadSDNode>(N) && !cast<LSBaseSDNode>(N)->isVolatile();
+ const bool IsLoad = isa<LoadSDNode>(N) && !cast<LoadSDNode>(N)->isVolatile();
// Starting off.
Chains.push_back(OriginalChain);
unsigned Depth = 0;
+ // Attempt to improve the chain by a single step.
+ std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool {
+ switch (C.getOpcode()) {
+ case ISD::EntryToken:
+ // No need to mark EntryToken.
+ C = SDValue();
+ return true;
+ case ISD::LOAD:
+ case ISD::STORE: {
+ // Get alias information for C.
+ bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
+ !cast<LSBaseSDNode>(C.getNode())->isVolatile();
+ if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
+ // Look further up the chain.
+ C = C.getOperand(0);
+ return true;
+ }
+ // Alias, so stop here.
+ return false;
+ }
+
+ case ISD::CopyFromReg:
+ // Always forward past CopyFromReg.
+ C = C.getOperand(0);
+ return true;
+
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ // We can forward past any lifetime start/end that can be proven not to
+ // alias the memory access.
+ if (!isAlias(N, C.getNode())) {
+ // Look further up the chain.
+ C = C.getOperand(0);
+ return true;
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+ };
+
// Look at each chain and determine if it is an alias. If so, add it to the
// aliases list. If not, then continue up the chain looking for the next
// candidate.
while (!Chains.empty()) {
SDValue Chain = Chains.pop_back_val();
+ // Don't bother if we've seen Chain before.
+ if (!Visited.insert(Chain.getNode()).second)
+ continue;
+
// For TokenFactor nodes, look at each operand and only continue up the
// chain until we reach the depth limit.
//
@@ -19156,58 +20625,30 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
return;
}
- // Don't bother if we've been before.
- if (!Visited.insert(Chain.getNode()).second)
- continue;
-
- switch (Chain.getOpcode()) {
- case ISD::EntryToken:
- // Entry token is ideal chain operand, but handled in FindBetterChain.
- break;
-
- case ISD::LOAD:
- case ISD::STORE: {
- // Get alias information for Chain.
- bool IsOpLoad = isa<LoadSDNode>(Chain.getNode()) &&
- !cast<LSBaseSDNode>(Chain.getNode())->isVolatile();
-
- // If chain is alias then stop here.
- if (!(IsLoad && IsOpLoad) &&
- isAlias(cast<LSBaseSDNode>(N), cast<LSBaseSDNode>(Chain.getNode()))) {
- Aliases.push_back(Chain);
- } else {
- // Look further up the chain.
- Chains.push_back(Chain.getOperand(0));
- ++Depth;
- }
- break;
- }
-
- case ISD::TokenFactor:
+ if (Chain.getOpcode() == ISD::TokenFactor) {
// We have to check each of the operands of the token factor for "small"
// token factors, so we queue them up. Adding the operands to the queue
// (stack) in reverse order maintains the original order and increases the
// likelihood that getNode will find a matching token factor (CSE.)
if (Chain.getNumOperands() > 16) {
Aliases.push_back(Chain);
- break;
+ continue;
}
for (unsigned n = Chain.getNumOperands(); n;)
Chains.push_back(Chain.getOperand(--n));
++Depth;
- break;
-
- case ISD::CopyFromReg:
- // Forward past CopyFromReg.
- Chains.push_back(Chain.getOperand(0));
+ continue;
+ }
+ // Everything else
+ if (ImproveChain(Chain)) {
+ // Updated chain found; consider the new chain if one exists.
+ if (Chain.getNode())
+ Chains.push_back(Chain);
++Depth;
- break;
-
- default:
- // For all other instructions we will just have to take what we can get.
- Aliases.push_back(Chain);
- break;
+ continue;
}
+ // No improved chain possible; treat as an alias.
+ Aliases.push_back(Chain);
}
}
@@ -19232,13 +20673,15 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
return Aliases[0];
// Construct a custom tailored token factor.
- return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
+ return DAG.getTokenFactor(SDLoc(N), Aliases);
}
+namespace {
// TODO: Replace with std::monostate when we move to C++17.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }
+} // namespace
// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
@@ -19349,7 +20792,7 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
if (AddNewChain)
TFOps.insert(TFOps.begin(), NewChain);
- SDValue TF = DAG.getNode(ISD::TokenFactor, SDLoc(STChain), MVT::Other, TFOps);
+ SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
CombineTo(St, TF);
AddToWorklist(STChain);