Diffstat (limited to 'llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp')
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 906 |
1 file changed, 645 insertions(+), 261 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e8950b58d42d..e5bc08b9280a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -131,6 +131,7 @@ namespace { const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; + bool LegalDAG = false; bool LegalOperations = false; bool LegalTypes = false; bool ForCodeSize; @@ -179,6 +180,12 @@ namespace { AddToWorklist(Node); } + /// Convenient shorthand to add a node and all of its users to the worklist. + void AddToWorklistWithUsers(SDNode *N) { + AddUsersToWorklist(N); + AddToWorklist(N); + } + // Prune potentially dangling nodes. This is called after // any visit to a node, but should also be called during a visit after any // failed combine which may have created a DAG node. @@ -217,14 +224,16 @@ namespace { DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { - ForCodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + ForCodeSize = DAG.shouldOptForSize(); MaximumLegalStoreInBits = 0; + // We use the minimum store size here, since that's all we can guarantee + // for the scalable vector types. for (MVT VT : MVT::all_valuetypes()) if (EVT(VT).isSimple() && VT != MVT::Other && TLI.isTypeLegal(EVT(VT)) && - VT.getSizeInBits() >= MaximumLegalStoreInBits) - MaximumLegalStoreInBits = VT.getSizeInBits(); + VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits) - MaximumLegalStoreInBits = VT.getSizeInBits(); + MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize(); } void ConsiderForPruning(SDNode *N) { @@ -622,7 +631,7 @@ namespace { ConstantSDNode *Mask, SDNode *&NodeToMask); /// Attempt to propagate a given AND node back to load leaves so that they /// can be combined into narrow loads. - bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG); + bool BackwardsPropagateMask(SDNode *N); /// Helper function for MergeConsecutiveStores which merges the /// component store chains. @@ -1026,8 +1035,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. - AddToWorklist(TLO.New.getNode()); - AddUsersToWorklist(TLO.New.getNode()); + AddToWorklistWithUsers(TLO.New.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to @@ -1393,6 +1401,7 @@ bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { void DAGCombiner::Run(CombineLevel AtLevel) { // set the instance variables, so that the various visit routines may use it. Level = AtLevel; + LegalDAG = Level >= AfterLegalizeDAG; LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; @@ -1419,14 +1428,13 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // If this combine is running after legalizing the DAG, re-legalize any // nodes pulled off the worklist.
- if (Level == AfterLegalizeDAG) { + if (LegalDAG) { SmallSetVector<SDNode *, 16> UpdatedNodes; bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); - for (SDNode *LN : UpdatedNodes) { - AddUsersToWorklist(LN); - AddToWorklist(LN); - } + for (SDNode *LN : UpdatedNodes) + AddToWorklistWithUsers(LN); + if (!NIsValid) continue; } @@ -2800,6 +2808,96 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, return SDValue(); } +// If we are facing some sort of diamond carry/borrow in/out pattern, try to +// match patterns like: +// +// (uaddo A, B) CarryIn +// | \ | +// | \ | +// PartialSum PartialCarryOutX / +// | | / +// | ____|____________/ +// | / | +// (uaddo *, *) \________ +// | \ \ +// | \ | +// | PartialCarryOutY | +// | \ | +// | \ / +// AddCarrySum | ______/ +// | / +// CarryOut = (or *, *) +// +// And generate ADDCARRY (or SUBCARRY) with two result values: +// +// {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn) +// +// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with +// a single path for carry/borrow out propagation: +static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG, + const TargetLowering &TLI, SDValue Carry0, + SDValue Carry1, SDNode *N) { + if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1) + return SDValue(); + unsigned Opcode = Carry0.getOpcode(); + if (Opcode != Carry1.getOpcode()) + return SDValue(); + if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) + return SDValue(); + + // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the + // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in + // the above ASCII art.) + if (Carry1.getOperand(0) != Carry0.getValue(0) && + Carry1.getOperand(1) != Carry0.getValue(0)) + std::swap(Carry0, Carry1); + if (Carry1.getOperand(0) != Carry0.getValue(0) && + Carry1.getOperand(1) != Carry0.getValue(0)) + return SDValue(); + + // The carry in value must be on the right-hand side for subtraction. + unsigned CarryInOperandNum = + Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0; + if (Opcode == ISD::USUBO && CarryInOperandNum != 1) + return SDValue(); + SDValue CarryIn = Carry1.getOperand(CarryInOperandNum); + + unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY; + if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType())) + return SDValue(); + + // Verify that the carry/borrow in is plausibly a carry/borrow bit. + // TODO: make getAsCarry() aware of how partial carries are merged. + if (CarryIn.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + CarryIn = CarryIn.getOperand(0); + if (CarryIn.getValueType() != MVT::i1) + return SDValue(); + + SDLoc DL(N); + SDValue Merged = + DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0), + Carry0.getOperand(1), CarryIn); + + // Because we have proven that the result of the UADDO/USUBO of A and B feeds + // into the UADDO/USUBO that does the carry/borrow in, we know that if the + // first UADDO/USUBO overflows, the second UADDO/USUBO cannot. For example, + // consider 8-bit numbers where 0xFF is the maximum value. + // + // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry + // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow) + // + // This is important because it means that OR and XOR can be used to merge + // carry flags; and that AND can return a constant zero.
+ // + // TODO: match other operations that can merge flags (ADD, etc) + DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0)); + if (N->getOpcode() == ISD::AND) + return DAG.getConstant(0, DL, MVT::i1); + return Merged.getValue(1); +} + SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N) { // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry. @@ -3006,6 +3104,20 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1), N1.getOperand(0))); + // A - (A & B) -> A & (~B) + if (N1.getOpcode() == ISD::AND) { + SDValue A = N1.getOperand(0); + SDValue B = N1.getOperand(1); + if (A != N0) + std::swap(A, B); + if (A == N0 && + (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) { + SDValue InvB = + DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT)); + return DAG.getNode(ISD::AND, DL, VT, A, InvB); + } + } + // fold (X - (-Y * Z)) -> (X + (Y * Z)) if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) { if (N1.getOperand(0).getOpcode() == ISD::SUB && @@ -4225,7 +4337,6 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { // If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX. // Only do this if the current op isn't legal and the flipped is. unsigned Opcode = N->getOpcode(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegal(Opcode, VT) && (N0.isUndef() || DAG.SignBitIsZero(N0)) && (N1.isUndef() || DAG.SignBitIsZero(N1))) { @@ -4543,8 +4654,8 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) if (LL == RL && LR == RR) { - ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) - : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); + ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT) + : ISD::getSetCCOrOperation(CC0, CC1, OpVT); if (NewCC != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && @@ -4856,7 +4967,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N, return true; } -bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) { +bool DAGCombiner::BackwardsPropagateMask(SDNode *N) { auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!Mask) return false; @@ -5092,6 +5203,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue Shuffle = XformToShuffleWithZero(N)) return Shuffle; + if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + return Combined; + // fold (and (or x, C), D) -> D if (C & D) == D auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); }; @@ -5238,14 +5352,13 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - if (Level >= AfterLegalizeTypes) { + if (LegalTypes) { // Attempt to propagate the AND back up to the leaves which, if they're // loads, can be combined to narrow loads and the AND node can be removed. // Perform after legalization so that extend nodes will already be // combined into the loads.
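To see what the matched diamond looks like at the source level, here is a minimal standalone C++ sketch (illustrative only, not part of the patch; the names mirror the ASCII art above): one limb of a multi-word addition.

#include <cstdint>

// One limb of a multi-word addition. The two additions become the two uaddo
// nodes of the diamond; the final '|' merges the partial carry bits (at most
// one of which can be set) and is what combineCarryDiamond rewrites, together
// with the adds, into a single two-result ADDCARRY node.
uint64_t add_limb(uint64_t A, uint64_t B, uint64_t CarryIn,
                  uint64_t &CarryOut) {
  uint64_t PartialSum = A + B;                    // (uaddo A, B)
  uint64_t PartialCarryOutX = PartialSum < A;     // overflow of the first add
  uint64_t AddCarrySum = PartialSum + CarryIn;    // (uaddo PartialSum, CarryIn)
  uint64_t PartialCarryOutY = AddCarrySum < PartialSum;
  CarryOut = PartialCarryOutX | PartialCarryOutY; // CarryOut = (or *, *)
  return AddCarrySum;
}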
- if (BackwardsPropagateMask(N, DAG)) { + if (BackwardsPropagateMask(N)) return SDValue(N, 0); - } } if (SDValue Combined = visitANDLike(N0, N1, N)) @@ -5787,6 +5900,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; + if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + return Combined; + // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; @@ -6418,7 +6534,7 @@ static unsigned BigEndianByteAt(unsigned BW, unsigned i) { // Check if the bytes offsets we are looking at match with either big or // little endian value loaded. Return true for big endian, false for little // endian, and None if match failed. -static Optional<bool> isBigEndian(const SmallVector<int64_t, 4> &ByteOffsets, +static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset) { // The endian can be decided only when it is 2 bytes at least. unsigned Width = ByteOffsets.size(); @@ -6491,7 +6607,6 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT)) return SDValue(); @@ -6499,7 +6614,7 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) { // to the same base address. Collect bytes offsets from Base address into // ByteOffsets. SDValue CombinedValue; - SmallVector<int64_t, 4> ByteOffsets(Width, INT64_MAX); + SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX); int64_t FirstOffset = INT64_MAX; StoreSDNode *FirstStore = nullptr; Optional<BaseIndexOffset> Base; @@ -6655,13 +6770,6 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // Before legalize we can introduce too wide illegal loads which will be later - // split into legal sized loads. This enables us to combine i64 load by i8 - // patterns to a couple of i32 loads on 32 bit targets. - if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) - return SDValue(); - bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); @@ -6683,12 +6791,22 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. - SmallVector<int64_t, 4> ByteOffsets(ByteWidth); - for (unsigned i = 0; i < ByteWidth; i++) { + SmallVector<int64_t, 8> ByteOffsets(ByteWidth); + unsigned ZeroExtendedBytes = 0; + for (int i = ByteWidth - 1; i >= 0; --i) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); - if (!P || !P->isMemory()) // All the bytes must be loaded from memory + if (!P) return SDValue(); + if (P->isConstantZero()) { + // It's OK for the N most significant bytes to be 0, we can just + // zero-extend the load. 
+ if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i))) + return SDValue(); + continue; + } + assert(P->isMemory() && "provenance should either be memory or zero"); + LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && L->isSimple() && !L->isIndexed() && @@ -6727,9 +6845,26 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); + bool NeedsZext = ZeroExtendedBytes > 0; + + EVT MemVT = + EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8); + + if (!MemVT.isSimple()) + return SDValue(); + + // Before legalize we can introduce too wide illegal loads which will be later + // split into legal sized loads. This enables us to combine i64 load by i8 + // patterns to a couple of i32 loads on 32 bit targets. + if (LegalOperations && + !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, + MemVT)) + return SDValue(); + // Check if the bytes of the OR we are looking at match with either big or // little endian value load - Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset); + Optional<bool> IsBigEndian = isBigEndian( + makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset); if (!IsBigEndian.hasValue()) return SDValue(); @@ -6742,7 +6877,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can - // replace it with a single load and bswap if needed. + // replace it with a single (possibly zero-extended) load and bswap + shift if + // needed. // If the load needs byte swap check if the target supports it bool NeedsBswap = IsBigEndianTarget != *IsBigEndian; @@ -6750,25 +6886,45 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // load and byte shuffling instead of several loads and byte shuffling. - if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) + // We do not introduce illegal bswaps when zero-extending as this tends to + // introduce too many arithmetic instructions. + if (NeedsBswap && (LegalOperations || NeedsZext) && + !TLI.isOperationLegal(ISD::BSWAP, VT)) + return SDValue(); + + // If we need to bswap and zero extend, we have to insert a shift. Check that + // it is legal. + if (NeedsBswap && NeedsZext && LegalOperations && + !TLI.isOperationLegal(ISD::SHL, VT)) return SDValue(); // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; - bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - VT, *FirstLoad->getMemOperand(), &Fast); + bool Allowed = + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + *FirstLoad->getMemOperand(), &Fast); if (!Allowed || !Fast) return SDValue(); - SDValue NewLoad = - DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), - FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); + SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, + SDLoc(N), VT, Chain, FirstLoad->getBasePtr(), + FirstLoad->getPointerInfo(), MemVT, + FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); - return NeedsBswap ? 
DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; + if (!NeedsBswap) + return NewLoad; + + SDValue ShiftedLoad = + NeedsZext + ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad, + DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT, + SDLoc(N), LegalOperations)) + : NewLoad; + return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad); } // If the target has andn, bsl, or a similar bit-select instruction, @@ -6904,7 +7060,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue LHS, RHS, CC; if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), - LHS.getValueType().isInteger()); + LHS.getValueType()); if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { switch (N0Opcode) { @@ -6964,6 +7120,13 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { DAG.getAllOnesConstant(DL, VT)); } + // fold (not (add X, -1)) -> (neg X) + if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD && + isAllOnesOrAllOnesSplat(N0.getOperand(1))) { + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + N0.getOperand(0)); + } + // fold (xor (and x, y), y) -> (and (not x), y) if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) { SDValue X = N0.getOperand(0); @@ -7051,6 +7214,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); + if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N)) + return Combined; + return SDValue(); } @@ -7567,8 +7733,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (VT.isVector()) ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, VT.getVectorNumElements()); - if ((!LegalOperations || - TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT))) + if (!LegalOperations || + TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) == + TargetLowering::Legal) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), DAG.getValueType(ExtVT)); } @@ -7776,26 +7943,40 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } } - // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) - // TODO - support non-uniform vector shift amounts. if (N1C && N0.getOpcode() == ISD::TRUNCATE && N0.getOperand(0).getOpcode() == ISD::SRL) { - if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { + SDValue InnerShift = N0.getOperand(0); + // TODO - support non-uniform vector shift amounts. + if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) { uint64_t c1 = N001C->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); - EVT InnerShiftVT = N0.getOperand(0).getValueType(); - EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType(); + EVT InnerShiftVT = InnerShift.getValueType(); + EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); + // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2))) // This is only valid if the OpSizeInBits + c1 = size of inner shift. 
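The ZeroExtendedBytes handling above widens MatchLoadCombine to byte-assembly idioms whose most significant bytes are constant zero. A hedged C++ illustration of the kind of source pattern that can now become a single zero-extending load (whether it actually fires depends on the target's ZEXTLOAD legality and fast-access checks):

#include <cstdint>

// Two adjacent bytes or-ed into a 32-bit value: the top two result bytes are
// known zero, so rather than rejecting the pattern, the combiner can emit one
// i16 ZEXTLOAD (plus a bswap and shift when the endianness does not match).
uint32_t load16_le(const uint8_t *p) {
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8);
}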
if (c1 + OpSizeInBits == InnerShiftSize) { - SDLoc DL(N0); + SDLoc DL(N); if (c1 + c2 >= InnerShiftSize) return DAG.getConstant(0, DL, VT); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(ISD::SRL, DL, InnerShiftVT, - N0.getOperand(0).getOperand(0), - DAG.getConstant(c1 + c2, DL, - ShiftCountVT))); + SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT); + SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT, + InnerShift.getOperand(0), NewShiftAmt); + return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift); + } + // In the more general case, we can clear the high bits after the shift: + // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask) + if (N0.hasOneUse() && InnerShift.hasOneUse() && + c1 + c2 < InnerShiftSize) { + SDLoc DL(N); + SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT); + SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT, + InnerShift.getOperand(0), NewShiftAmt); + SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize, + OpSizeInBits - c2), + DL, InnerShiftVT); + SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask); + return DAG.getNode(ISD::TRUNCATE, DL, VT, And); } } } @@ -8585,6 +8766,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + // Try transforming N to an indexed store. + if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -8609,6 +8794,10 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MLD->getPassThru(), MLD->getChain()); + // Try transforming N to an indexed load. + if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -9108,6 +9297,8 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) return SDValue(); + assert(!DstVT.isScalableVector() && "Unexpected scalable vector type"); + SDLoc DL(N); const unsigned NumSplits = DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); @@ -9125,8 +9316,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, DL, BasePtr.getValueType())); + BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL); Loads.push_back(SplitLoad.getValue(0)); Chains.push_back(SplitLoad.getValue(1)); @@ -9365,11 +9555,10 @@ static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG, SDLoc dl(Ld); SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru()); - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(), - Ld->getBasePtr(), Ld->getMask(), - PassThru, Ld->getMemoryVT(), - Ld->getMemOperand(), ExtLoadType, - Ld->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), + PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(), + ExtLoadType, Ld->isExpandingLoad()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); return NewLoad; } @@ -9397,10 +9586,15 @@ static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG, // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1) // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1) SDLoc DL(N); - SDValue NotX = DAG.getNOT(DL, X, VT); - 
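A quick numeric check of the more general srl (trunc (srl x, c1)), c2 fold above, written as plain C++ with the mask computed the same way as in the combine (the concrete values are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  // i64 inner value truncated to i32: OpSizeInBits = 32, InnerShiftSize = 64.
  uint64_t x = 0x123456789ABCDEF0ull;
  unsigned c1 = 16, c2 = 8;                       // c1 + c2 < InnerShiftSize
  uint32_t before = (uint32_t)(x >> c1) >> c2;    // srl (trunc (srl x, c1)), c2
  uint64_t mask = ((uint64_t)1 << (32 - c2)) - 1; // low (OpSizeInBits - c2) bits
  uint32_t after = (uint32_t)((x >> (c1 + c2)) & mask);
  assert(before == after);                        // both 0x0056789A
  return 0;
}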
SDValue ShiftAmount = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - auto ShiftOpcode = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL; - return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount); + unsigned ShCt = VT.getSizeInBits() - 1; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) { + SDValue NotX = DAG.getNOT(DL, X, VT); + SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT); + auto ShiftOpcode = + N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL; + return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount); + } } return SDValue(); } @@ -9671,6 +9865,29 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, return (Known.Zero | 1).isAllOnesValue(); } +/// Given an extending node with a pop-count operand, if the target does not +/// support a pop-count in the narrow source type but does support it in the +/// destination type, widen the pop-count to the destination type. +static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) { + assert((Extend->getOpcode() == ISD::ZERO_EXTEND || + Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op"); + + SDValue CtPop = Extend->getOperand(0); + if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse()) + return SDValue(); + + EVT VT = Extend->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) || + !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT)) + return SDValue(); + + // zext (ctpop X) --> ctpop (zext X) + SDLoc DL(Extend); + SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT); + return DAG.getNode(ISD::CTPOP, DL, VT, NewZext); +} + SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -9921,6 +10138,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; + if (SDValue NewCtPop = widenCtPop(N, DAG)) + return NewCtPop; + return SDValue(); } @@ -10067,6 +10287,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SCC; } + if (SDValue NewCtPop = widenCtPop(N, DAG)) + return NewCtPop; + return SDValue(); } @@ -10273,17 +10496,14 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { if (DAG.getDataLayout().isBigEndian()) ShAmt = AdjustBigEndianShift(ShAmt); - EVT PtrType = N0.getOperand(1).getValueType(); uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, - PtrType, LN0->getBasePtr(), - DAG.getConstant(PtrOff, DL, PtrType), - Flags); + SDValue NewPtr = + DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -10735,16 +10955,16 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // e.g. 
trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue VecSrc = N0.getOperand(0); - EVT SrcVT = VecSrc.getValueType(); - if (SrcVT.isVector() && SrcVT.getScalarType() == VT && + EVT VecSrcVT = VecSrc.getValueType(); + if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT && (!LegalOperations || - TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) { + TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) { SDLoc SL(N); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); - unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1; - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, - VecSrc, DAG.getConstant(Idx, SL, IdxVT)); + unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc, + DAG.getConstant(Idx, SL, IdxVT)); } } @@ -11299,11 +11519,11 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. @@ -11359,7 +11579,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -11373,7 +11594,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), @@ -11427,7 +11649,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1, Flags); @@ -11456,7 +11679,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1, Flags); @@ -11471,7 +11695,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); if (isContractableFMUL(N120) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, 
N120.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0, Flags); @@ -11489,7 +11714,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); if (isContractableFMUL(N102) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0, Flags); @@ -11510,11 +11736,11 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = - TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. @@ -11579,7 +11805,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -11595,7 +11822,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -11617,7 +11845,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N00.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -11640,7 +11869,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N000.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, @@ -11671,7 +11901,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && - isContractableFMUL(N1.getOperand(2))) { + isContractableFMUL(N1.getOperand(2)) && + N1->hasOneUse()) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return 
DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -11686,12 +11917,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), (fneg z))) - if (N0.getOpcode() == PreferredFusedOpcode) { + if (N0.getOpcode() == PreferredFusedOpcode && + N0->hasOneUse()) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N020.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -11716,7 +11949,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), @@ -11736,10 +11970,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { + N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && + N1->hasOneUse()) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableFMUL(N120) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -11768,7 +12004,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N101 = CvtSrc.getOperand(1); SDValue N102 = CvtSrc.getOperand(2); if (isContractableFMUL(N102) && - TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) { + TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, + CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -11812,7 +12049,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { // Floating-point multiply-add without intermediate rounding. bool HasFMA = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && + TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // Floating-point multiply-add with intermediate rounding. This can result @@ -12402,6 +12639,15 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { } } + // fold (fma (fneg X), Y, (fneg Z)) -> (fneg (fma X, Y, Z)) + // fold (fma X, (fneg Y), (fneg Z)) -> (fneg (fma X, Y, Z)) + if (!TLI.isFNegFree(VT) && + TLI.isNegatibleForFree(SDValue(N, 0), DAG, LegalOperations, + ForCodeSize) == 2) + return DAG.getNode(ISD::FNEG, DL, VT, + TLI.getNegatedExpression(SDValue(N, 0), DAG, + LegalOperations, ForCodeSize), + Flags); return SDValue(); } @@ -12738,7 +12984,7 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { // Assume that libcalls are the smallest code. // TODO: This restriction should probably be lifted for vectors.
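A spot-check of the fsub rearrangement above using std::fma, with inputs chosen so every intermediate is exact (in general the fused and unfused forms can differ by a rounding step, which is why these folds are guarded by contraction rules and, with this patch, by one-use checks):

#include <cassert>
#include <cmath>

int main() {
  // fold (fsub x, (fma y, z, (fmul u, v)))
  //   -> (fma (fneg y), z, (fma (fneg u), v, x))
  double x = 8.0, y = 2.0, z = 3.0, u = 0.5, v = 4.0;
  double before = x - std::fma(y, z, u * v);
  double after = std::fma(-y, z, std::fma(-u, v, x));
  assert(before == after); // both evaluate to 0.0 for these exact inputs
  return 0;
}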
- if (DAG.getMachineFunction().getFunction().hasOptSize()) + if (ForCodeSize) return SDValue(); // pow(X, 0.25) --> sqrt(sqrt(X)) @@ -13135,6 +13381,16 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize)) return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0. FIXME: This is + // duplicated in isNegatibleForFree, but isNegatibleForFree doesn't know it + // was called from a context with an nsz flag if the input fsub does not + // have one. + if (N0.getOpcode() == ISD::FSUB && + (DAG.getTarget().Options.NoSignedZerosFPMath || + N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) { + return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1), + N0.getOperand(0), N->getFlags()); + } + // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. if (!TLI.isFNegFree(VT) && @@ -13168,9 +13424,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (CFP1) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); - if (Level >= AfterLegalizeDAG && - (TLI.isFPImmLegal(CVal, VT, ForCodeSize) || - TLI.isOperationLegal(ISD::ConstantFP, VT))) + if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) || + TLI.isOperationLegal(ISD::ConstantFP, VT))) return DAG.getNode( ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), @@ -13423,12 +13678,22 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, EVT VT; unsigned AS; - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = LD->getMemoryVT(); AS = LD->getAddressSpace(); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) { + if (ST->isIndexed() || ST->getBasePtr().getNode() != N) + return false; + VT = ST->getMemoryVT(); + AS = ST->getAddressSpace(); + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) { + if (LD->isIndexed() || LD->getBasePtr().getNode() != N) + return false; + VT = LD->getMemoryVT(); + AS = LD->getAddressSpace(); + } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getMemoryVT(); @@ -13462,38 +13727,64 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, VT.getTypeForEVT(*DAG.getContext()), AS); } -/// Try turning a load/store into a pre-indexed load/store when the base -/// pointer is an add or subtract and it has other uses besides the load/store. -/// After the transformation, the new indexed load/store has effectively folded -/// the add/subtract in and all of its other uses are redirected to the -/// new load/store.
-bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { - if (Level < AfterLegalizeDAG) - return false; - - bool isLoad = true; - SDValue Ptr; - EVT VT; - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { +static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec, + bool &IsLoad, bool &IsMasked, SDValue &Ptr, + const TargetLowering &TLI) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { if (LD->isIndexed()) return false; - VT = LD->getMemoryVT(); - if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT)) return false; Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { if (ST->isIndexed()) return false; - VT = ST->getMemoryVT(); - if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT)) return false; Ptr = ST->getBasePtr(); - isLoad = false; + IsLoad = false; + } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { + if (LD->isIndexed()) + return false; + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) && + !TLI.isIndexedMaskedLoadLegal(Dec, VT)) + return false; + Ptr = LD->getBasePtr(); + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { + if (ST->isIndexed()) + return false; + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) && + !TLI.isIndexedMaskedStoreLegal(Dec, VT)) + return false; + Ptr = ST->getBasePtr(); + IsLoad = false; + IsMasked = true; } else { return false; } + return true; +} + +/// Try turning a load/store into a pre-indexed load/store when the base +/// pointer is an add or subtract and it has other uses besides the load/store. +/// After the transformation, the new indexed load/store has effectively folded +/// the add/subtract in and all of its other uses are redirected to the +/// new load/store. +bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; + + bool IsLoad = true; + bool IsMasked = false; + SDValue Ptr; + if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked, + Ptr, TLI)) + return false; // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. @@ -13535,8 +13826,9 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return false; // Check #2. - if (!isLoad) { - SDValue Val = cast<StoreSDNode>(N)->getValue(); + if (!IsLoad) { + SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue() + : cast<StoreSDNode>(N)->getValue(); // Would require a copy. 
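For context, this is the source-level idiom that pre/post-indexed combining targets, as a C++ sketch (whether it fires depends on the target's isIndexedLoadLegal/isIndexedStoreLegal hooks and, with this patch, their new masked counterparts):

// On targets with post-indexed addressing (e.g. AArch64), the store and the
// pointer bump in each iteration can fold into one post-incrementing store;
// the masked load/store support added here lets predicated vector loops get
// the same treatment.
void fill(int *p, int n, int v) {
  for (int i = 0; i < n; ++i)
    *p++ = v; // store through p, then advance p
}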
if (Val == BasePtr) @@ -13612,18 +13904,26 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return false; SDValue Result; - if (isLoad) - Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); - else - Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); + if (!IsMasked) { + if (IsLoad) + Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); + else + Result = + DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); + } else { + if (IsLoad) + Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM); + else + Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM); + } ++PreIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); - if (isLoad) { + if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { @@ -13677,7 +13977,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // We can now generate the new expression. SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0)); - SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0); + SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0); SDValue NewUse = DAG.getNode(Opcode, DL, @@ -13687,7 +13987,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { } // Replace the uses of Ptr with uses of the updated base value. - DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); + DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0)); deleteAndRecombine(Ptr.getNode()); AddToWorklist(Result.getNode()); @@ -13702,29 +14002,12 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; - bool isLoad = true; + bool IsLoad = true; + bool IsMasked = false; SDValue Ptr; - EVT VT; - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { - if (LD->isIndexed()) - return false; - VT = LD->getMemoryVT(); - if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) && - !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT)) - return false; - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { - if (ST->isIndexed()) - return false; - VT = ST->getMemoryVT(); - if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) && - !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT)) - return false; - Ptr = ST->getBasePtr(); - isLoad = false; - } else { + if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, + Ptr, TLI)) return false; - } if (Ptr.getNode()->hasOneUse()) return false; @@ -13760,7 +14043,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { // If all the uses are load / store addresses, then don't do the // transformation. - if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ + if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { bool RealUse = false; for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) @@ -13786,18 +14069,24 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { Worklist.push_back(Op); if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) { - SDValue Result = isLoad - ? 
DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM) - : DAG.getIndexedStore(SDValue(N,0), SDLoc(N), - BasePtr, Offset, AM); + SDValue Result; + if (!IsMasked) + Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM) + : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); + else + Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM) + : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); - if (isLoad) { + if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { @@ -13809,7 +14098,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), - Result.getValue(isLoad ? 1 : 0)); + Result.getValue(IsLoad ? 1 : 0)); deleteAndRecombine(Op); return true; } @@ -13923,8 +14212,8 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { // the stored value). With Offset=n (for n > 0) the loaded value starts at the // n:th least significant byte of the stored value. if (DAG.getDataLayout().isBigEndian()) - Offset = (STMemType.getStoreSizeInBits() - - LDMemType.getStoreSizeInBits()) / 8 - Offset; + Offset = ((int64_t)STMemType.getStoreSizeInBits() - + (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset; // Check that the stored value cover all bits that are loaded. bool STCoversLD = @@ -14066,7 +14355,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { return V; // Try to infer better alignment information than the load already has. - if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { + if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) { SDValue NewLoad = DAG.getExtLoad( @@ -14786,8 +15075,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, SDValue Ptr = St->getBasePtr(); if (StOffset) { SDLoc DL(IVal); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType())); + Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL); NewAlign = MinAlign(NewAlign, StOffset); } @@ -14898,10 +15186,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy)) return SDValue(); - SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD), - Ptr.getValueType(), Ptr, - DAG.getConstant(PtrOff, SDLoc(LD), - Ptr.getValueType())); + SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD)); SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, @@ -15081,7 +15366,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( // The latest Node in the DAG. SDLoc DL(StoreNodes[0].MemNode); - int64_t ElementSizeBits = MemVT.getStoreSizeInBits(); + TypeSize ElementSizeBits = MemVT.getStoreSizeInBits(); unsigned SizeInBits = NumStores * ElementSizeBits; unsigned NumMemElts = MemVT.isVector() ? 
MemVT.getVectorNumElements() : 1; @@ -15466,7 +15751,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { Attribute::NoImplicitFloat); // This function cannot currently deal with non-byte-sized memory sizes. - if (ElementSizeBytes * 8 != MemVT.getSizeInBits()) + if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits()) return false; if (!MemVT.isSimple()) @@ -16015,6 +16300,9 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { if (Value.getOpcode() == ISD::TargetConstantFP) return SDValue(); + if (!ISD::isNormalStore(ST)) + return SDValue(); + SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -16075,8 +16363,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4, DL, Ptr.getValueType())); + Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL); Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), @@ -16111,8 +16398,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT, DAG, *ST->getMemOperand())) { return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, - ST->getPointerInfo(), ST->getAlignment(), - ST->getMemOperand()->getFlags(), ST->getAAInfo()); + ST->getMemOperand()); } } @@ -16121,7 +16407,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return Chain; // Try to infer better alignment information than the store already has. - if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { + if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) { SDValue NewStore = @@ -16451,9 +16737,7 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); - Ptr = - DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType())); + Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL); // Higher value store. SDValue St1 = DAG.getStore(St0, DL, Hi, Ptr, @@ -16464,11 +16748,15 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { /// Convert a disguised subvector insertion into a shuffle: SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { + assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && + "Expected insert_vector_elt"); SDValue InsertVal = N->getOperand(1); SDValue Vec = N->getOperand(0); - // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), InsIndex) - // --> (vector_shuffle X, Y) + // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), + // InsIndex) + // --> (vector_shuffle X, Y) and variations where shuffle operands may be + // CONCAT_VECTORS. if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() && InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa<ConstantSDNode>(InsertVal.getOperand(1))) { @@ -16481,18 +16769,47 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { // Vec's operand 0 is using indices from 0 to N-1 and // operand 1 from N to 2N - 1, where N is the number of // elements in the vectors.
- int XOffset = -1; - if (InsertVal.getOperand(0) == X) { - XOffset = 0; - } else if (InsertVal.getOperand(0) == Y) { - XOffset = X.getValueType().getVectorNumElements(); + SDValue InsertVal0 = InsertVal.getOperand(0); + int ElementOffset = -1; + + // We explore the inputs of the shuffle in order to see if we find the + // source of the extract_vector_elt. If so, we can use it to modify the + // shuffle rather than perform an insert_vector_elt. + SmallVector<std::pair<int, SDValue>, 8> ArgWorkList; + ArgWorkList.emplace_back(Mask.size(), Y); + ArgWorkList.emplace_back(0, X); + + while (!ArgWorkList.empty()) { + int ArgOffset; + SDValue ArgVal; + std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val(); + + if (ArgVal == InsertVal0) { + ElementOffset = ArgOffset; + break; + } + + // Peek through concat_vector. + if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) { + int CurrentArgOffset = + ArgOffset + ArgVal.getValueType().getVectorNumElements(); + int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements(); + for (SDValue Op : reverse(ArgVal->ops())) { + CurrentArgOffset -= Step; + ArgWorkList.emplace_back(CurrentArgOffset, Op); + } + + // Make sure we went through all the elements and did not screw up index + // computation. + assert(CurrentArgOffset == ArgOffset); + } } - if (XOffset != -1) { + if (ElementOffset != -1) { SmallVector<int, 16> NewMask(Mask.begin(), Mask.end()); auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1)); - NewMask[InsIndex] = XOffset + ExtrIndex->getZExtValue(); + NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue(); assert(NewMask[InsIndex] < (int)(2 * Vec.getValueType().getVectorNumElements()) && NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound"); @@ -16562,13 +16879,14 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue EltNo = N->getOperand(2); SDLoc DL(N); - // If the inserted element is an UNDEF, just use the input vector. - if (InVal.isUndef()) - return InVec; - EVT VT = InVec.getValueType(); unsigned NumElts = VT.getVectorNumElements(); + // Insert into out-of-bounds element is undefined. + if (auto *IndexC = dyn_cast<ConstantSDNode>(EltNo)) + if (IndexC->getZExtValue() >= VT.getVectorNumElements()) + return DAG.getUNDEF(VT); + // Remove redundant insertions: // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && @@ -16683,7 +17001,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, // operand can't represent this new access since the offset is variable. MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace()); } - NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset); + NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL); // The replacement we need to do here is a little tricky: we need to // replace an extractelement of a load with a load. @@ -16723,8 +17041,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, AddToWorklist(EVE); // Since we're explicitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. 
- AddUsersToWorklist(Load.getNode()); // Add users too - AddToWorklist(Load.getNode()); + AddToWorklistWithUsers(Load.getNode()); ++OpsNarrowed; return SDValue(EVE, 0); } @@ -18239,22 +18556,61 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { return DAG.getBitcast(NVT, NewExtract); } } - // TODO - handle (DestNumElts % SrcNumElts) == 0 + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } + } + } } - // Combine: - // (extract_subvec (concat V1, V2, ...), i) - // Into: - // Vi if possible - // Only operand 0 is checked as 'concat' assumes all inputs of the same - // type. - if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index) && - V.getOperand(0).getValueType() == NVT) { - unsigned Idx = N->getConstantOperandVal(1); - unsigned NumElems = NVT.getVectorNumElements(); - assert((Idx % NumElems) == 0 && - "IDX in concat is not a multiple of the result vector length."); - return V->getOperand(Idx / NumElems); + if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index)) { + EVT ConcatSrcVT = V.getOperand(0).getValueType(); + assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() && + "Concat and extract subvector do not change element type"); + + unsigned ExtIdx = N->getConstantOperandVal(1); + unsigned ExtNumElts = NVT.getVectorNumElements(); + assert(ExtIdx % ExtNumElts == 0 && + "Extract index is not a multiple of the input vector length."); + + unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorNumElements(); + unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts; + + // If the concatenated source types match this extract, it's a direct + // simplification: + // extract_subvec (concat V1, V2, ...), i --> Vi + if (ConcatSrcNumElts == ExtNumElts) + return V.getOperand(ConcatOpIdx); + + // If the concatenated source vectors are a multiple of the length of this + // extract, then extract a fraction of one of those source vectors directly + // from a concat operand.
+ // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
+ // v2i8 extract_subvec v8i8 Y, 6
+ if (ConcatSrcNumElts % ExtNumElts == 0) {
+ SDLoc DL(N);
+ unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
+ assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
+ "Trying to extract from >1 concat operand?");
+ assert(NewExtIdx % ExtNumElts == 0 &&
+ "Extract index is not a multiple of the result vector length.");
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue NewIndexC = DAG.getConstant(NewExtIdx, DL, IdxTy);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
+ V.getOperand(ConcatOpIdx), NewIndexC);
+ }
}
V = peekThroughBitcasts(V);
@@ -18962,6 +19318,30 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return V;
}
+ // A shuffle of a concat of the same narrow vector can be reduced to use
+ // only low-half elements of a concat with undef:
+ // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
+ N0.getNumOperands() == 2 &&
+ N0.getOperand(0) == N0.getOperand(1)) {
+ int HalfNumElts = (int)NumElts / 2;
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx >= HalfNumElts) {
+ assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
+ Idx -= HalfNumElts;
+ }
+ NewMask.push_back(Idx);
+ }
+ if (TLI.isShuffleMaskLegal(NewMask, VT)) {
+ SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
+ SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
+ N0.getOperand(0), UndefVec);
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
+ }
+ }
+
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
@@ -19446,8 +19826,10 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
int EltIdx = i / Split;
int SubIdx = i % Split;
SDValue Elt = RHS.getOperand(EltIdx);
+ // X & undef --> 0 (not undef). So this lane must be converted to choose
+ // from the zero constant vector (same as if the element had all 0-bits).
if (Elt.isUndef()) {
- Indices.push_back(-1);
+ Indices.push_back(i + NumSubElts);
continue;
}
@@ -19460,14 +19842,10 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
return SDValue();
// Extract the sub element from the constant bit mask.
- if (DAG.getDataLayout().isBigEndian()) { - Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits); - } else { - Bits.lshrInPlace(SubIdx * NumSubBits); - } - - if (Split > 1) - Bits = Bits.trunc(NumSubBits); + if (DAG.getDataLayout().isBigEndian()) + Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits); + else + Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits); if (Bits.isAllOnesValue()) Indices.push_back(i); @@ -19910,22 +20288,28 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) { unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1; - SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); - SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); - AddToWorklist(Shift.getNode()); - - if (XType.bitsGT(AType)) { - Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); + if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) { + SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); + SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); - } - if (CC == ISD::SETGT) - Shift = DAG.getNOT(DL, Shift, AType); + if (XType.bitsGT(AType)) { + Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); + AddToWorklist(Shift.getNode()); + } - return DAG.getNode(ISD::AND, DL, AType, Shift, N2); + if (CC == ISD::SETGT) + Shift = DAG.getNOT(DL, Shift, AType); + + return DAG.getNode(ISD::AND, DL, AType, Shift, N2); + } } - SDValue ShiftAmt = DAG.getConstant(XType.getSizeInBits() - 1, DL, ShiftAmtTy); + unsigned ShCt = XType.getSizeInBits() - 1; + if (TLI.shouldAvoidTransformToShift(XType, ShCt)) + return SDValue(); + + SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); @@ -20035,31 +20419,29 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. - // TODO: The operation legality checks could be loosened to include "custom", - // but that may cause regressions for targets that do not have shift - // instructions. if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && - N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2) && - TLI.isOperationLegal(ISD::SHL, VT) && - TLI.isOperationLegal(ISD::SRA, VT)) { + N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { SDValue AndLHS = N0->getOperand(0); auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { // Shift the tested bit over the sign bit. const APInt &AndMask = ConstAndRHS->getAPIntValue(); - SDValue ShlAmt = - DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS), - getShiftAmountTy(AndLHS.getValueType())); - SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt); - - // Now arithmetic right shift it all the way over, so the result is either - // all-ones, or zero. 
- SDValue ShrAmt =
- DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl),
- getShiftAmountTy(Shl.getValueType()));
- SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
-
- return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
+ unsigned ShCt = AndMask.getBitWidth() - 1;
+ if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
+ SDValue ShlAmt =
+ DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
+ getShiftAmountTy(AndLHS.getValueType()));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
+
+ // Now arithmetic right shift it all the way over, so the result is
+ // either all-ones, or zero.
+ SDValue ShrAmt =
+ DAG.getConstant(ShCt, SDLoc(Shl),
+ getShiftAmountTy(Shl.getValueType()));
+ SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
+
+ return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
+ }
}
}
@@ -20073,7 +20455,7 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
(!LegalOperations ||
TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
if (Swap) {
- CC = ISD::getSetCCInverse(CC, CmpOpVT.isInteger());
+ CC = ISD::getSetCCInverse(CC, CmpOpVT);
std::swap(N2C, N3C);
}
@@ -20101,10 +20483,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
if (N2C->isOne())
return Temp;
+ unsigned ShCt = N2C->getAPIntValue().logBase2();
+ if (TLI.shouldAvoidTransformToShift(VT, ShCt))
+ return SDValue();
+
// shl setcc result by log2 n2c
return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
- DAG.getConstant(N2C->getAPIntValue().logBase2(),
- SDLoc(Temp),
+ DAG.getConstant(ShCt, SDLoc(Temp),
getShiftAmountTy(Temp.getValueType())));
}
@@ -20237,7 +20622,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
/// Result = N X_i + X_i (N - N A X_i)
SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
SDNodeFlags Flags) {
- if (Level >= AfterLegalizeDAG)
+ if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
@@ -20376,7 +20761,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
/// Op can be zero.
SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
bool Reciprocal) {
- if (Level >= AfterLegalizeDAG)
+ if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
@@ -20411,9 +20796,8 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
SDLoc DL(Op);
EVT CCVT = getSetCCResultType(VT);
ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- const Function &F = DAG.getMachineFunction().getFunction();
- Attribute Denorms = F.getFnAttribute("denormal-fp-math");
- if (Denorms.getValueAsString().equals("ieee")) {
+ DenormalMode DenormMode = DAG.getDenormalMode(VT);
+ if (DenormMode == DenormalMode::IEEE) {
// fabs(X) < SmallestNormal ? 0.0 : Est
const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
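
The extract_subvector-of-concat fold above is pure index arithmetic: pick the concat operand that contains the extracted range, then rebase the extract index into that operand. Below is a rough standalone sketch of the same computation over plain integers; the function name and I/O are illustrative, not LLVM API.

#include <cassert>
#include <cstdio>

// Extract ExtNumElts elements at index ExtIdx from a concat whose operands
// each have ConcatSrcNumElts elements. Preconditions mirror the asserts in
// the combine: ExtIdx % ExtNumElts == 0 and ConcatSrcNumElts % ExtNumElts == 0.
static void rebaseExtract(unsigned ExtIdx, unsigned ExtNumElts,
                          unsigned ConcatSrcNumElts) {
  unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
  unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
  assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
         "extract straddles two concat operands");
  printf("extract %u elts from concat operand %u at index %u\n", ExtNumElts,
         ConcatOpIdx, NewExtIdx);
}

int main() {
  // The example from the comment:
  // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14
  //   --> v2i8 extract_subvec v8i8 Y, 6
  rebaseExtract(/*ExtIdx=*/14, /*ExtNumElts=*/2, /*ConcatSrcNumElts=*/8);
  return 0;
}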
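
The visitVECTOR_SHUFFLE fold of a duplicated concat only rewrites the shuffle mask: any lane that chose from the duplicated upper half is redirected to the identical lower half, which is what lets the second concat operand become undef. A minimal sketch of that mask rewrite, again with illustrative names; -1 denotes an undef mask element.

#include <cstdio>
#include <vector>

// Mask' for shuf (concat X, undef): indices into the upper duplicate of X
// are remapped into the lower copy; undef (-1) lanes pass through unchanged.
static std::vector<int> foldDuplicatedConcatMask(const std::vector<int> &Mask) {
  int HalfNumElts = (int)Mask.size() / 2;
  std::vector<int> NewMask;
  for (int Idx : Mask)
    NewMask.push_back(Idx >= HalfNumElts ? Idx - HalfNumElts : Idx);
  return NewMask;
}

int main() {
  // shuf (concat X, X), undef, <0, 3, 1, 2> --> mask <0, 1, 1, 0>
  for (int I : foldDuplicatedConcatMask({0, 3, 1, 2}))
    printf("%d ", I);
  printf("\n");
  return 0;
}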
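
In XformToShuffleWithZero, each sub-element of the constant AND mask now classifies one shuffle lane: an all-ones sub-mask keeps the source lane (index i), while an all-zero or undef sub-mask selects lane i + N of an all-zero vector (N being the lane count), since x & undef folds to 0 rather than undef. A hedged sketch of that classification over a byte-granularity mask; the helper name is hypothetical.

#include <cstdio>
#include <vector>

// kUndef stands in for an undef constant operand in the mask vector.
enum : int { kUndef = -1 };

// Returns shuffle indices over (source, zero-vector), or empty if some lane
// has mixed bits and cannot be expressed as a shuffle.
static std::vector<int> maskToShuffleIndices(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  std::vector<int> Indices;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] == 0xFF)
      Indices.push_back(i);           // all-ones: keep the source lane
    else if (Mask[i] == 0x00 || Mask[i] == kUndef)
      Indices.push_back(i + NumElts); // zero or undef: take the zero lane
    else
      return {};                      // mixed bits: give up on the transform
  }
  return Indices;
}

int main() {
  // (and X, <0xFF, 0x00, undef, 0xFF>) --> shuffle X, zero, <0, 5, 6, 3>
  for (int I : maskToShuffleIndices({0xFF, 0x00, kUndef, 0xFF}))
    printf("%d ", I);
  printf("\n");
  return 0;
}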
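
foldSelectCCToShiftAnd, now gated by TLI.shouldAvoidTransformToShift, rewrites the sign-test variant select_cc (setlt X, 0), C, 0 with a power-of-2 C as (and (srl X, BitWidth - log2(C) - 1), C): the logical right shift lands the sign bit exactly in C's bit position, and the AND clears everything else. A small sketch of the scalar arithmetic, assuming a 32-bit type and illustrative names.

#include <cstdint>
#include <cstdio>

// X < 0 ? C : 0, for C a power of two, without a branch or select:
// shift the sign bit of X down into C's bit position, then mask with C.
static uint32_t signSelectPow2(int32_t X, uint32_t C) {
  unsigned ShCt = 32 - __builtin_ctz(C) - 1; // BitWidth - log2(C) - 1
  return ((uint32_t)X >> ShCt) & C;          // srl + and
}

int main() {
  printf("%u\n", signSelectPow2(-7, 4)); // 4: sign bit of X is set
  printf("%u\n", signSelectPow2(7, 4));  // 0: sign bit of X is clear
  return 0;
}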