Diffstat (limited to 'llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp')
 -rw-r--r-- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1790
1 file changed, 1143 insertions, 647 deletions
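Three of the combines added in this patch lean on small bit-twiddling identities: buildLogBase2 computes LogBase2(V) as (EltBits - 1) - ctlz(V), matchCombineLoadWithAndMask narrows G_AND(G_LOAD, mask) to a G_ZEXTLOAD when the mask is a run of low ones of byte-sized power-of-two width, and matchBitfieldExtractFromShr turns shr (shl x, C1), C2 into a bitfield extract at Pos = C2 - C1 with Width = Size - C2. The standalone C++ sketch below checks those identities outside of GlobalISel; it is illustrative only: the helper names are made up, none of it is LLVM API, and the mask check is simplified (the real matcher also checks the destination register size and legality).

#include <cassert>
#include <cstdint>

// buildLogBase2: LogBase2(V) = (Bits - 1) - ctlz(V), for non-zero V.
// __builtin_clz stands in for what G_CTLZ computes in the combine.
static unsigned logBase2(uint32_t V) {
  assert(V != 0 && "undefined for zero");
  return 31u - static_cast<unsigned>(__builtin_clz(V));
}

// matchCombineLoadWithAndMask, simplified: the mask must be a run of low
// ones whose width is a byte-sized power of two not exceeding the number
// of bits actually loaded from memory.
static bool canNarrowToZextLoad(uint64_t Mask, unsigned LoadSizeBits) {
  if (Mask == 0 || (Mask & (Mask + 1)) != 0) // must look like 0b0...01...1
    return false;
  unsigned Width = 64u - static_cast<unsigned>(__builtin_clzll(Mask));
  return Width >= 8 && (Width & (Width - 1)) == 0 && Width <= LoadSizeBits;
}

// An unsigned bitfield extract of Width bits starting at bit Pos, the
// operation G_UBFX performs.
static uint32_t ubfx(uint32_t X, unsigned Pos, unsigned Width) {
  return (X >> Pos) & ((Width == 32 ? 0u : (1u << Width)) - 1u);
}

int main() {
  assert(logBase2(1u << 5) == 5);
  assert(canNarrowToZextLoad(0xFF, 16));    // s16 load + 0xFF -> s8 zextload
  assert(!canNarrowToZextLoad(0xFF00, 16)); // not a low-bits mask
  const uint32_t X = 0xDEADBEEF;
  const unsigned ShlAmt = 8, ShrAmt = 16, Size = 32;
  // lshr (shl x, C1), C2  ==  ubfx(x, C2 - C1, Size - C2)
  assert(((X << ShlAmt) >> ShrAmt) == ubfx(X, ShrAmt - ShlAmt, Size - ShrAmt));
}

The diff itself follows.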
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 06d827de2e96..3a52959d54bf 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -12,9 +12,11 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -26,8 +28,10 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DivisionByConstantInfo.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetMachine.h" #include <tuple> #define DEBUG_TYPE "gi-combiner" @@ -46,8 +50,9 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, GISelKnownBits *KB, MachineDominatorTree *MDT, const LegalizerInfo *LI) - : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), - KB(KB), MDT(MDT), LI(LI) { + : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB), + MDT(MDT), LI(LI), RBI(Builder.getMF().getSubtarget().getRegBankInfo()), + TRI(Builder.getMF().getSubtarget().getRegisterInfo()) { (void)this->KB; } @@ -64,6 +69,16 @@ static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) { return I; } +/// Determines the LogBase2 value for a non-null input value using the +/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). +static Register buildLogBase2(Register V, MachineIRBuilder &MIB) { + auto &MRI = *MIB.getMRI(); + LLT Ty = MRI.getType(V); + auto Ctlz = MIB.buildCTLZ(Ty, V); + auto Base = MIB.buildConstant(Ty, Ty.getScalarSizeInBits() - 1); + return MIB.buildSub(Ty, Base, Ctlz).getReg(0); +} + /// \returns The big endian in-memory byte position of byte \p I in a /// \p ByteWidth bytes wide type. /// @@ -143,6 +158,24 @@ void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI, Observer.changedInstr(*FromRegOp.getParent()); } +void CombinerHelper::replaceOpcodeWith(MachineInstr &FromMI, + unsigned ToOpcode) const { + Observer.changingInstr(FromMI); + + FromMI.setDesc(Builder.getTII().get(ToOpcode)); + + Observer.changedInstr(FromMI); +} + +const RegisterBank *CombinerHelper::getRegBank(Register Reg) const { + return RBI->getRegBank(Reg, MRI, *TRI); +} + +void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) { + if (RegBank) + MRI.setRegBank(Reg, *RegBank); +} + bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { if (matchCombineCopy(MI)) { applyCombineCopy(MI); @@ -486,10 +519,7 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI, continue; // Check for legality. 
if (LI) { - LegalityQuery::MemDesc MMDesc; - MMDesc.MemoryTy = MMO.getMemoryType(); - MMDesc.AlignInBits = MMO.getAlign().value() * 8; - MMDesc.Ordering = MMO.getSuccessOrdering(); + LegalityQuery::MemDesc MMDesc(MMO); LLT UseTy = MRI.getType(UseMI.getOperand(0).getReg()); LLT SrcTy = MRI.getType(LoadMI->getPointerReg()); if (LI->getAction({LoadMI->getOpcode(), {UseTy, SrcTy}, {MMDesc}}) @@ -623,13 +653,83 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI, Observer.changedInstr(MI); } +bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_AND); + + // If we have the following code: + // %mask = G_CONSTANT 255 + // %ld = G_LOAD %ptr, (load s16) + // %and = G_AND %ld, %mask + // + // Try to fold it into + // %ld = G_ZEXTLOAD %ptr, (load s8) + + Register Dst = MI.getOperand(0).getReg(); + if (MRI.getType(Dst).isVector()) + return false; + + auto MaybeMask = + getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!MaybeMask) + return false; + + APInt MaskVal = MaybeMask->Value; + + if (!MaskVal.isMask()) + return false; + + Register SrcReg = MI.getOperand(1).getReg(); + GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI); + if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) || + !LoadMI->isSimple()) + return false; + + Register LoadReg = LoadMI->getDstReg(); + LLT LoadTy = MRI.getType(LoadReg); + Register PtrReg = LoadMI->getPointerReg(); + uint64_t LoadSizeBits = LoadMI->getMemSizeInBits(); + unsigned MaskSizeBits = MaskVal.countTrailingOnes(); + + // The mask may not be larger than the in-memory type, as it might cover sign + // extended bits + if (MaskSizeBits > LoadSizeBits) + return false; + + // If the mask covers the whole destination register, there's nothing to + // extend + if (MaskSizeBits >= LoadTy.getSizeInBits()) + return false; + + // Most targets cannot deal with loads of size < 8 and need to re-legalize to + // at least byte loads. Avoid creating such loads here + if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits)) + return false; + + const MachineMemOperand &MMO = LoadMI->getMMO(); + LegalityQuery::MemDesc MemDesc(MMO); + MemDesc.MemoryTy = LLT::scalar(MaskSizeBits); + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*LoadMI); + auto &MF = B.getMF(); + auto PtrInfo = MMO.getPointerInfo(); + auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8); + B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO); + }; + return true; +} + bool CombinerHelper::isPredecessor(const MachineInstr &DefMI, const MachineInstr &UseMI) { assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() && "shouldn't consider debug uses"); assert(DefMI.getParent() == UseMI.getParent()); if (&DefMI == &UseMI) - return false; + return true; const MachineBasicBlock &MBB = *DefMI.getParent(); auto DefOrUse = find_if(MBB, [&DefMI, &UseMI](const MachineInstr &MI) { return &MI == &DefMI || &MI == &UseMI; @@ -711,6 +811,16 @@ bool CombinerHelper::matchSextInRegOfLoad( // anyway for most targets. 
if (!isPowerOf2_32(NewSizeBits)) return false; + + const MachineMemOperand &MMO = LoadDef->getMMO(); + LegalityQuery::MemDesc MMDesc(MMO); + MMDesc.MemoryTy = LLT::scalar(NewSizeBits); + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SEXTLOAD, + {MRI.getType(LoadDef->getDstReg()), + MRI.getType(LoadDef->getPointerReg())}, + {MMDesc}})) + return false; + MatchInfo = std::make_tuple(LoadDef->getDstReg(), NewSizeBits); return true; } @@ -1093,81 +1203,6 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI, Observer.changedInstr(*BrCond); } -static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { - // On Darwin, -Os means optimize for size without hurting performance, so - // only really optimize for size when -Oz (MinSize) is used. - if (MF.getTarget().getTargetTriple().isOSDarwin()) - return MF.getFunction().hasMinSize(); - return MF.getFunction().hasOptSize(); -} - -// Returns a list of types to use for memory op lowering in MemOps. A partial -// port of findOptimalMemOpLowering in TargetLowering. -static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps, - unsigned Limit, const MemOp &Op, - unsigned DstAS, unsigned SrcAS, - const AttributeList &FuncAttributes, - const TargetLowering &TLI) { - if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign()) - return false; - - LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes); - - if (Ty == LLT()) { - // Use the largest scalar type whose alignment constraints are satisfied. - // We only need to check DstAlign here as SrcAlign is always greater or - // equal to DstAlign (or zero). - Ty = LLT::scalar(64); - if (Op.isFixedDstAlign()) - while (Op.getDstAlign() < Ty.getSizeInBytes() && - !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign())) - Ty = LLT::scalar(Ty.getSizeInBytes()); - assert(Ty.getSizeInBits() > 0 && "Could not find valid type"); - // FIXME: check for the largest legal type we can load/store to. - } - - unsigned NumMemOps = 0; - uint64_t Size = Op.size(); - while (Size) { - unsigned TySize = Ty.getSizeInBytes(); - while (TySize > Size) { - // For now, only use non-vector load / store's for the left-over pieces. - LLT NewTy = Ty; - // FIXME: check for mem op safety and legality of the types. Not all of - // SDAGisms map cleanly to GISel concepts. - if (NewTy.isVector()) - NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32); - NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1)); - unsigned NewTySize = NewTy.getSizeInBytes(); - assert(NewTySize > 0 && "Could not find appropriate type"); - - // If the new LLT cannot cover all of the remaining bits, then consider - // issuing a (or a pair of) unaligned and overlapping load / store. - bool Fast; - // Need to get a VT equivalent for allowMisalignedMemoryAccesses(). - MVT VT = getMVTForLLT(Ty); - if (NumMemOps && Op.allowOverlap() && NewTySize < Size && - TLI.allowsMisalignedMemoryAccesses( - VT, DstAS, Op.isFixedDstAlign() ? 
Op.getDstAlign() : Align(1), - MachineMemOperand::MONone, &Fast) && - Fast) - TySize = Size; - else { - Ty = NewTy; - TySize = NewTySize; - } - } - - if (++NumMemOps > Limit) - return false; - - MemOps.push_back(Ty); - Size -= TySize; - } - - return true; -} - static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { if (Ty.isVector()) return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), @@ -1175,460 +1210,20 @@ static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { return IntegerType::get(C, Ty.getSizeInBits()); } -// Get a vectorized representation of the memset value operand, GISel edition. -static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { - MachineRegisterInfo &MRI = *MIB.getMRI(); - unsigned NumBits = Ty.getScalarSizeInBits(); - auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); - if (!Ty.isVector() && ValVRegAndVal) { - APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8); - APInt SplatVal = APInt::getSplat(NumBits, Scalar); - return MIB.buildConstant(Ty, SplatVal).getReg(0); - } - - // Extend the byte value to the larger type, and then multiply by a magic - // value 0x010101... in order to replicate it across every byte. - // Unless it's zero, in which case just emit a larger G_CONSTANT 0. - if (ValVRegAndVal && ValVRegAndVal->Value == 0) { - return MIB.buildConstant(Ty, 0).getReg(0); - } - - LLT ExtType = Ty.getScalarType(); - auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val); - if (NumBits > 8) { - APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); - auto MagicMI = MIB.buildConstant(ExtType, Magic); - Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0); - } - - // For vector types create a G_BUILD_VECTOR. - if (Ty.isVector()) - Val = MIB.buildSplatVector(Ty, Val).getReg(0); - - return Val; -} - -bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, - Register Val, uint64_t KnownLen, - Align Alignment, bool IsVolatile) { - auto &MF = *MI.getParent()->getParent(); - const auto &TLI = *MF.getSubtarget().getTargetLowering(); - auto &DL = MF.getDataLayout(); - LLVMContext &C = MF.getFunction().getContext(); - - assert(KnownLen != 0 && "Have a zero length memset length!"); - - bool DstAlignCanChange = false; - MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); - - MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); - if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) - DstAlignCanChange = true; - - unsigned Limit = TLI.getMaxStoresPerMemset(OptSize); - std::vector<LLT> MemOps; - - const auto &DstMMO = **MI.memoperands_begin(); - MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); - - auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); - bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0; - - if (!findGISelOptimalMemOpLowering(MemOps, Limit, - MemOp::Set(KnownLen, DstAlignCanChange, - Alignment, - /*IsZeroMemset=*/IsZeroVal, - /*IsVolatile=*/IsVolatile), - DstPtrInfo.getAddrSpace(), ~0u, - MF.getFunction().getAttributes(), TLI)) - return false; - - if (DstAlignCanChange) { - // Get an estimate of the type from the LLT. - Type *IRTy = getTypeForLLT(MemOps[0], C); - Align NewAlign = DL.getABITypeAlign(IRTy); - if (NewAlign > Alignment) { - Alignment = NewAlign; - unsigned FI = FIDef->getOperand(1).getIndex(); - // Give the stack frame object a larger alignment if needed. 
- if (MFI.getObjectAlign(FI) < Alignment) - MFI.setObjectAlignment(FI, Alignment); - } - } - - MachineIRBuilder MIB(MI); - // Find the largest store and generate the bit pattern for it. - LLT LargestTy = MemOps[0]; - for (unsigned i = 1; i < MemOps.size(); i++) - if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits()) - LargestTy = MemOps[i]; - - // The memset stored value is always defined as an s8, so in order to make it - // work with larger store types we need to repeat the bit pattern across the - // wider type. - Register MemSetValue = getMemsetValue(Val, LargestTy, MIB); - - if (!MemSetValue) - return false; - - // Generate the stores. For each store type in the list, we generate the - // matching store of that type to the destination address. - LLT PtrTy = MRI.getType(Dst); - unsigned DstOff = 0; - unsigned Size = KnownLen; - for (unsigned I = 0; I < MemOps.size(); I++) { - LLT Ty = MemOps[I]; - unsigned TySize = Ty.getSizeInBytes(); - if (TySize > Size) { - // Issuing an unaligned load / store pair that overlaps with the previous - // pair. Adjust the offset accordingly. - assert(I == MemOps.size() - 1 && I != 0); - DstOff -= TySize - Size; - } - - // If this store is smaller than the largest store see whether we can get - // the smaller value for free with a truncate. - Register Value = MemSetValue; - if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) { - MVT VT = getMVTForLLT(Ty); - MVT LargestVT = getMVTForLLT(LargestTy); - if (!LargestTy.isVector() && !Ty.isVector() && - TLI.isTruncateFree(LargestVT, VT)) - Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0); - else - Value = getMemsetValue(Val, Ty, MIB); - if (!Value) - return false; - } - - auto *StoreMMO = - MF.getMachineMemOperand(&DstMMO, DstOff, Ty); - - Register Ptr = Dst; - if (DstOff != 0) { - auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); - Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); - } - - MIB.buildStore(Value, Ptr, *StoreMMO); - DstOff += Ty.getSizeInBytes(); - Size -= TySize; - } - - MI.eraseFromParent(); - return true; -} - bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE); - - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Len = MI.getOperand(2).getReg(); - - const auto *MMOIt = MI.memoperands_begin(); - const MachineMemOperand *MemOp = *MMOIt; - bool IsVolatile = MemOp->isVolatile(); - - // See if this is a constant length copy - auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI); - // FIXME: support dynamically sized G_MEMCPY_INLINE - assert(LenVRegAndVal.hasValue() && - "inline memcpy with dynamic size is not yet supported"); - uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue(); - if (KnownLen == 0) { - MI.eraseFromParent(); - return true; - } - - const auto &DstMMO = **MI.memoperands_begin(); - const auto &SrcMMO = **std::next(MI.memoperands_begin()); - Align DstAlign = DstMMO.getBaseAlign(); - Align SrcAlign = SrcMMO.getBaseAlign(); - - return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, - IsVolatile); -} - -bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI, Register Dst, - Register Src, uint64_t KnownLen, - Align DstAlign, Align SrcAlign, - bool IsVolatile) { - assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE); - return optimizeMemcpy(MI, Dst, Src, KnownLen, - std::numeric_limits<uint64_t>::max(), DstAlign, - SrcAlign, IsVolatile); -} - -bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, 
Register Dst, - Register Src, uint64_t KnownLen, - uint64_t Limit, Align DstAlign, - Align SrcAlign, bool IsVolatile) { - auto &MF = *MI.getParent()->getParent(); - const auto &TLI = *MF.getSubtarget().getTargetLowering(); - auto &DL = MF.getDataLayout(); - LLVMContext &C = MF.getFunction().getContext(); - - assert(KnownLen != 0 && "Have a zero length memcpy length!"); - - bool DstAlignCanChange = false; - MachineFrameInfo &MFI = MF.getFrameInfo(); - Align Alignment = commonAlignment(DstAlign, SrcAlign); - - MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); - if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) - DstAlignCanChange = true; - - // FIXME: infer better src pointer alignment like SelectionDAG does here. - // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining - // if the memcpy is in a tail call position. - - std::vector<LLT> MemOps; - - const auto &DstMMO = **MI.memoperands_begin(); - const auto &SrcMMO = **std::next(MI.memoperands_begin()); - MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); - MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); - - if (!findGISelOptimalMemOpLowering( - MemOps, Limit, - MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign, - IsVolatile), - DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), - MF.getFunction().getAttributes(), TLI)) - return false; - - if (DstAlignCanChange) { - // Get an estimate of the type from the LLT. - Type *IRTy = getTypeForLLT(MemOps[0], C); - Align NewAlign = DL.getABITypeAlign(IRTy); - - // Don't promote to an alignment that would require dynamic stack - // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (!TRI->hasStackRealignment(MF)) - while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign = NewAlign / 2; - - if (NewAlign > Alignment) { - Alignment = NewAlign; - unsigned FI = FIDef->getOperand(1).getIndex(); - // Give the stack frame object a larger alignment if needed. - if (MFI.getObjectAlign(FI) < Alignment) - MFI.setObjectAlignment(FI, Alignment); - } - } - - LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n"); - - MachineIRBuilder MIB(MI); - // Now we need to emit a pair of load and stores for each of the types we've - // collected. I.e. for each type, generate a load from the source pointer of - // that type width, and then generate a corresponding store to the dest buffer - // of that value loaded. This can result in a sequence of loads and stores - // mixed types, depending on what the target specifies as good types to use. - unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); - unsigned Size = KnownLen; - for (auto CopyTy : MemOps) { - // Issuing an unaligned load / store pair that overlaps with the previous - // pair. Adjust the offset accordingly. - if (CopyTy.getSizeInBytes() > Size) - CurrOffset -= CopyTy.getSizeInBytes() - Size; - - // Construct MMOs for the accesses. - auto *LoadMMO = - MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); - auto *StoreMMO = - MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); - - // Create the load. - Register LoadPtr = Src; - Register Offset; - if (CurrOffset != 0) { - Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) - .getReg(0); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); - } - auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); - - // Create the store. - Register StorePtr = - CurrOffset == 0 ? 
Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); - MIB.buildStore(LdVal, StorePtr, *StoreMMO); - CurrOffset += CopyTy.getSizeInBytes(); - Size -= CopyTy.getSizeInBytes(); - } - - MI.eraseFromParent(); - return true; -} - -bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, - Register Src, uint64_t KnownLen, - Align DstAlign, Align SrcAlign, - bool IsVolatile) { - auto &MF = *MI.getParent()->getParent(); - const auto &TLI = *MF.getSubtarget().getTargetLowering(); - auto &DL = MF.getDataLayout(); - LLVMContext &C = MF.getFunction().getContext(); - - assert(KnownLen != 0 && "Have a zero length memmove length!"); - - bool DstAlignCanChange = false; - MachineFrameInfo &MFI = MF.getFrameInfo(); - bool OptSize = shouldLowerMemFuncForSize(MF); - Align Alignment = commonAlignment(DstAlign, SrcAlign); - - MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); - if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) - DstAlignCanChange = true; - - unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize); - std::vector<LLT> MemOps; - - const auto &DstMMO = **MI.memoperands_begin(); - const auto &SrcMMO = **std::next(MI.memoperands_begin()); - MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); - MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); - - // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due - // to a bug in it's findOptimalMemOpLowering implementation. For now do the - // same thing here. - if (!findGISelOptimalMemOpLowering( - MemOps, Limit, - MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign, - /*IsVolatile*/ true), - DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(), - MF.getFunction().getAttributes(), TLI)) - return false; - - if (DstAlignCanChange) { - // Get an estimate of the type from the LLT. - Type *IRTy = getTypeForLLT(MemOps[0], C); - Align NewAlign = DL.getABITypeAlign(IRTy); - - // Don't promote to an alignment that would require dynamic stack - // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (!TRI->hasStackRealignment(MF)) - while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) - NewAlign = NewAlign / 2; - - if (NewAlign > Alignment) { - Alignment = NewAlign; - unsigned FI = FIDef->getOperand(1).getIndex(); - // Give the stack frame object a larger alignment if needed. - if (MFI.getObjectAlign(FI) < Alignment) - MFI.setObjectAlignment(FI, Alignment); - } - } - - LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n"); - - MachineIRBuilder MIB(MI); - // Memmove requires that we perform the loads first before issuing the stores. - // Apart from that, this loop is pretty much doing the same thing as the - // memcpy codegen function. - unsigned CurrOffset = 0; - LLT PtrTy = MRI.getType(Src); - SmallVector<Register, 16> LoadVals; - for (auto CopyTy : MemOps) { - // Construct MMO for the load. - auto *LoadMMO = - MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); - - // Create the load. - Register LoadPtr = Src; - if (CurrOffset != 0) { - auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0); - } - LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); - CurrOffset += CopyTy.getSizeInBytes(); - } - - CurrOffset = 0; - for (unsigned I = 0; I < MemOps.size(); ++I) { - LLT CopyTy = MemOps[I]; - // Now store the values loaded. 
- auto *StoreMMO = - MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); - - Register StorePtr = Dst; - if (CurrOffset != 0) { - auto Offset = - MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); - StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0); - } - MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); - CurrOffset += CopyTy.getSizeInBytes(); - } - MI.eraseFromParent(); - return true; + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder); + return Helper.lowerMemcpyInline(MI) == + LegalizerHelper::LegalizeResult::Legalized; } bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { - const unsigned Opc = MI.getOpcode(); - // This combine is fairly complex so it's not written with a separate - // matcher function. - assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE || - Opc == TargetOpcode::G_MEMSET) && "Expected memcpy like instruction"); - - auto MMOIt = MI.memoperands_begin(); - const MachineMemOperand *MemOp = *MMOIt; - - Align DstAlign = MemOp->getBaseAlign(); - Align SrcAlign; - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Len = MI.getOperand(2).getReg(); - - if (Opc != TargetOpcode::G_MEMSET) { - assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); - MemOp = *(++MMOIt); - SrcAlign = MemOp->getBaseAlign(); - } - - // See if this is a constant length copy - auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI); - if (!LenVRegAndVal) - return false; // Leave it to the legalizer to lower it to a libcall. - uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue(); - - if (KnownLen == 0) { - MI.eraseFromParent(); - return true; - } - - bool IsVolatile = MemOp->isVolatile(); - if (Opc == TargetOpcode::G_MEMCPY_INLINE) - return tryEmitMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, - IsVolatile); - - // Don't try to optimize volatile. - if (IsVolatile) - return false; - - if (MaxLen && KnownLen > MaxLen) - return false; - - if (Opc == TargetOpcode::G_MEMCPY) { - auto &MF = *MI.getParent()->getParent(); - const auto &TLI = *MF.getSubtarget().getTargetLowering(); - bool OptSize = shouldLowerMemFuncForSize(MF); - uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize); - return optimizeMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign, - IsVolatile); - } - if (Opc == TargetOpcode::G_MEMMOVE) - return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); - if (Opc == TargetOpcode::G_MEMSET) - return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile); - return false; + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder); + return Helper.lowerMemCpyFamily(MI, MaxLen) == + LegalizerHelper::LegalizeResult::Legalized; } static Optional<APFloat> constantFoldFpUnary(unsigned Opcode, LLT DstTy, @@ -1706,30 +1301,52 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, Register Add2 = MI.getOperand(1).getReg(); Register Imm1 = MI.getOperand(2).getReg(); - auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI); + auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI); if (!MaybeImmVal) return false; - // Don't do this combine if there multiple uses of the first PTR_ADD, - // since we may be able to compute the second PTR_ADD as an immediate - // offset anyway. 
Folding the first offset into the second may cause us - // to go beyond the bounds of our legal addressing modes. - if (!MRI.hasOneNonDBGUse(Add2)) - return false; - - MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2); + MachineInstr *Add2Def = MRI.getVRegDef(Add2); if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD) return false; Register Base = Add2Def->getOperand(1).getReg(); Register Imm2 = Add2Def->getOperand(2).getReg(); - auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI); + auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI); if (!MaybeImm2Val) return false; + // Check if the new combined immediate forms an illegal addressing mode. + // Do not combine if it was legal before but would get illegal. + // To do so, we need to find a load/store user of the pointer to get + // the access type. + Type *AccessTy = nullptr; + auto &MF = *MI.getMF(); + for (auto &UseMI : MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) { + if (auto *LdSt = dyn_cast<GLoadStore>(&UseMI)) { + AccessTy = getTypeForLLT(MRI.getType(LdSt->getReg(0)), + MF.getFunction().getContext()); + break; + } + } + TargetLoweringBase::AddrMode AMNew; + APInt CombinedImm = MaybeImmVal->Value + MaybeImm2Val->Value; + AMNew.BaseOffs = CombinedImm.getSExtValue(); + if (AccessTy) { + AMNew.HasBaseReg = true; + TargetLoweringBase::AddrMode AMOld; + AMOld.BaseOffs = MaybeImm2Val->Value.getSExtValue(); + AMOld.HasBaseReg = true; + unsigned AS = MRI.getType(Add2).getAddressSpace(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + if (TLI.isLegalAddressingMode(MF.getDataLayout(), AMOld, AccessTy, AS) && + !TLI.isLegalAddressingMode(MF.getDataLayout(), AMNew, AccessTy, AS)) + return false; + } + // Pass the combined immediate to the apply function. - MatchInfo.Imm = (MaybeImmVal->Value + MaybeImm2Val->Value).getSExtValue(); + MatchInfo.Imm = AMNew.BaseOffs; MatchInfo.Base = Base; + MatchInfo.Bank = getRegBank(Imm2); return true; } @@ -1739,6 +1356,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI, MachineIRBuilder MIB(MI); LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg()); auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm); + setRegBank(NewOffset.getReg(0), MatchInfo.Bank); Observer.changingInstr(MI); MI.getOperand(1).setReg(MatchInfo.Base); MI.getOperand(2).setReg(NewOffset.getReg(0)); @@ -1762,7 +1380,7 @@ bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI, Register Shl2 = MI.getOperand(1).getReg(); Register Imm1 = MI.getOperand(2).getReg(); - auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI); + auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI); if (!MaybeImmVal) return false; @@ -1772,7 +1390,7 @@ bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI, Register Base = Shl2Def->getOperand(1).getReg(); Register Imm2 = Shl2Def->getOperand(2).getReg(); - auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI); + auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI); if (!MaybeImm2Val) return false; @@ -1856,7 +1474,7 @@ bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI, // Find a matching one-use shift by constant. const Register C1 = MI.getOperand(2).getReg(); - auto MaybeImmVal = getConstantVRegValWithLookThrough(C1, MRI); + auto MaybeImmVal = getIConstantVRegValWithLookThrough(C1, MRI); if (!MaybeImmVal) return false; @@ -1870,7 +1488,7 @@ bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI, // Must be a constant. 
auto MaybeImmVal = - getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); + getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); if (!MaybeImmVal) return false; @@ -1932,8 +1550,8 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI, Builder.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2}); // These were one use so it's safe to remove them. - MatchInfo.Shift2->eraseFromParent(); - MatchInfo.Logic->eraseFromParent(); + MatchInfo.Shift2->eraseFromParentAndMarkDBGValuesForRemoval(); + MatchInfo.Logic->eraseFromParentAndMarkDBGValuesForRemoval(); MI.eraseFromParent(); } @@ -1942,7 +1560,7 @@ bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); auto MaybeImmVal = - getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); if (!MaybeImmVal) return false; @@ -1977,7 +1595,7 @@ bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI, // TODO: Should handle vector splat. Register RHS = MI.getOperand(2).getReg(); - auto MaybeShiftAmtVal = getConstantVRegValWithLookThrough(RHS, MRI); + auto MaybeShiftAmtVal = getIConstantVRegValWithLookThrough(RHS, MRI); if (!MaybeShiftAmtVal) return false; @@ -2045,26 +1663,23 @@ bool CombinerHelper::matchCombineUnmergeMergeToPlainValues( MachineInstr &MI, SmallVectorImpl<Register> &Operands) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && "Expected an unmerge"); - Register SrcReg = - peekThroughBitcast(MI.getOperand(MI.getNumOperands() - 1).getReg(), MRI); + auto &Unmerge = cast<GUnmerge>(MI); + Register SrcReg = peekThroughBitcast(Unmerge.getSourceReg(), MRI); - MachineInstr *SrcInstr = MRI.getVRegDef(SrcReg); - if (SrcInstr->getOpcode() != TargetOpcode::G_MERGE_VALUES && - SrcInstr->getOpcode() != TargetOpcode::G_BUILD_VECTOR && - SrcInstr->getOpcode() != TargetOpcode::G_CONCAT_VECTORS) + auto *SrcInstr = getOpcodeDef<GMergeLikeOp>(SrcReg, MRI); + if (!SrcInstr) return false; // Check the source type of the merge. - LLT SrcMergeTy = MRI.getType(SrcInstr->getOperand(1).getReg()); - LLT Dst0Ty = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcMergeTy = MRI.getType(SrcInstr->getSourceReg(0)); + LLT Dst0Ty = MRI.getType(Unmerge.getReg(0)); bool SameSize = Dst0Ty.getSizeInBits() == SrcMergeTy.getSizeInBits(); if (SrcMergeTy != Dst0Ty && !SameSize) return false; // They are the same now (modulo a bitcast). // We can collect all the src registers. 
- for (unsigned Idx = 1, EndIdx = SrcInstr->getNumOperands(); Idx != EndIdx; - ++Idx) - Operands.push_back(SrcInstr->getOperand(Idx).getReg()); + for (unsigned Idx = 0; Idx < SrcInstr->getNumSources(); ++Idx) + Operands.push_back(SrcInstr->getSourceReg(Idx)); return true; } @@ -2241,7 +1856,7 @@ bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI, return false; auto MaybeImmVal = - getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); if (!MaybeImmVal) return false; @@ -2410,12 +2025,12 @@ void CombinerHelper::applyCombineAddP2IToPtrAdd( bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI, int64_t &NewCst) { - assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected a G_PTR_ADD"); - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + auto &PtrAdd = cast<GPtrAdd>(MI); + Register LHS = PtrAdd.getBaseReg(); + Register RHS = PtrAdd.getOffsetReg(); MachineRegisterInfo &MRI = Builder.getMF().getRegInfo(); - if (auto RHSCst = getConstantVRegSExtVal(RHS, MRI)) { + if (auto RHSCst = getIConstantVRegSExtVal(RHS, MRI)) { int64_t Cst; if (mi_match(LHS, MRI, m_GIntToPtr(m_ICst(Cst)))) { NewCst = Cst + *RHSCst; @@ -2428,12 +2043,12 @@ bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI, void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI, int64_t &NewCst) { - assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected a G_PTR_ADD"); - Register Dst = MI.getOperand(0).getReg(); + auto &PtrAdd = cast<GPtrAdd>(MI); + Register Dst = PtrAdd.getReg(0); Builder.setInstrAndDebugLoc(MI); Builder.buildConstant(Dst, NewCst); - MI.eraseFromParent(); + PtrAdd.eraseFromParent(); } bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) { @@ -2536,6 +2151,23 @@ bool CombinerHelper::matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src) { return mi_match(Src, MRI, m_GFabs(m_Reg(AbsSrc))); } +bool CombinerHelper::matchCombineFAbsOfFNeg(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FABS && "Expected a G_FABS"); + Register Src = MI.getOperand(1).getReg(); + Register NegSrc; + + if (!mi_match(Src, MRI, m_GFNeg(m_Reg(NegSrc)))) + return false; + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Observer.changingInstr(MI); + MI.getOperand(1).setReg(NegSrc); + Observer.changedInstr(MI); + }; + return true; +} + bool CombinerHelper::matchCombineTruncOfExt( MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC"); @@ -2587,7 +2219,7 @@ bool CombinerHelper::matchCombineTruncOfShl( {DstTy, getTargetLowering().getPreferredShiftAmountTy(DstTy)}})) { KnownBits Known = KB->getKnownBits(ShiftAmt); unsigned Size = DstTy.getSizeInBits(); - if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { + if (Known.countMaxActiveBits() <= Log2_32(Size)) { MatchInfo = std::make_pair(ShiftSrc, ShiftAmt); return true; } @@ -2644,13 +2276,13 @@ bool CombinerHelper::matchUndefSelectCmp(MachineInstr &MI) { } bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) { - assert(MI.getOpcode() == TargetOpcode::G_SELECT); - if (auto MaybeCstCmp = - getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI)) { - OpIdx = MaybeCstCmp->Value.isNullValue() ? 
3 : 2; - return true; - } - return false; + GSelect &SelMI = cast<GSelect>(MI); + auto Cst = + isConstantOrConstantSplatVector(*MRI.getVRegDef(SelMI.getCondReg()), MRI); + if (!Cst) + return false; + OpIdx = Cst->isZero() ? 3 : 2; + return true; } bool CombinerHelper::eraseInst(MachineInstr &MI) { @@ -2662,12 +2294,14 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1, const MachineOperand &MOP2) { if (!MOP1.isReg() || !MOP2.isReg()) return false; - MachineInstr *I1 = getDefIgnoringCopies(MOP1.getReg(), MRI); - if (!I1) + auto InstAndDef1 = getDefSrcRegIgnoringCopies(MOP1.getReg(), MRI); + if (!InstAndDef1) return false; - MachineInstr *I2 = getDefIgnoringCopies(MOP2.getReg(), MRI); - if (!I2) + auto InstAndDef2 = getDefSrcRegIgnoringCopies(MOP2.getReg(), MRI); + if (!InstAndDef2) return false; + MachineInstr *I1 = InstAndDef1->MI; + MachineInstr *I2 = InstAndDef2->MI; // Handle a case like this: // @@ -2727,15 +2361,26 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1, // // On the off-chance that there's some target instruction feeding into the // instruction, let's use produceSameValue instead of isIdenticalTo. - return Builder.getTII().produceSameValue(*I1, *I2, &MRI); + if (Builder.getTII().produceSameValue(*I1, *I2, &MRI)) { + // Handle instructions with multiple defs that produce same values. Values + // are same for operands with same index. + // %0:_(s8), %1:_(s8), %2:_(s8), %3:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>) + // %5:_(s8), %6:_(s8), %7:_(s8), %8:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>) + // I1 and I2 are different instructions but produce same values, + // %1 and %6 are same, %1 and %7 are not the same value. + return I1->findRegisterDefOperandIdx(InstAndDef1->Reg) == + I2->findRegisterDefOperandIdx(InstAndDef2->Reg); + } + return false; } bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) { if (!MOP.isReg()) return false; - // MIPatternMatch doesn't let us look through G_ZEXT etc. 
- auto ValAndVReg = getConstantVRegValWithLookThrough(MOP.getReg(), MRI); - return ValAndVReg && ValAndVReg->Value == C; + auto *MI = MRI.getVRegDef(MOP.getReg()); + auto MaybeCst = isConstantOrConstantSplatVector(*MI, MRI); + return MaybeCst.hasValue() && MaybeCst->getBitWidth() <= 64 && + MaybeCst->getSExtValue() == C; } bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, @@ -3115,14 +2760,14 @@ bool CombinerHelper::matchRedundantAnd(MachineInstr &MI, // // Check if we can replace AndDst with the LHS of the G_AND if (canReplaceReg(AndDst, LHS, MRI) && - (LHSBits.Zero | RHSBits.One).isAllOnesValue()) { + (LHSBits.Zero | RHSBits.One).isAllOnes()) { Replacement = LHS; return true; } // Check if we can replace AndDst with the RHS of the G_AND if (canReplaceReg(AndDst, RHS, MRI) && - (LHSBits.One | RHSBits.Zero).isAllOnesValue()) { + (LHSBits.One | RHSBits.Zero).isAllOnes()) { Replacement = RHS; return true; } @@ -3161,14 +2806,14 @@ bool CombinerHelper::matchRedundantOr(MachineInstr &MI, Register &Replacement) { // // Check if we can replace OrDst with the LHS of the G_OR if (canReplaceReg(OrDst, LHS, MRI) && - (LHSBits.One | RHSBits.Zero).isAllOnesValue()) { + (LHSBits.One | RHSBits.Zero).isAllOnes()) { Replacement = LHS; return true; } // Check if we can replace OrDst with the RHS of the G_OR if (canReplaceReg(OrDst, RHS, MRI) && - (LHSBits.Zero | RHSBits.One).isAllOnesValue()) { + (LHSBits.Zero | RHSBits.One).isAllOnes()) { Replacement = RHS; return true; } @@ -3346,7 +2991,8 @@ void CombinerHelper::applyXorOfAndWithSameReg( } bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); + auto &PtrAdd = cast<GPtrAdd>(MI); + Register DstReg = PtrAdd.getReg(0); LLT Ty = MRI.getType(DstReg); const DataLayout &DL = Builder.getMF().getDataLayout(); @@ -3354,20 +3000,20 @@ bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) { return false; if (Ty.isPointer()) { - auto ConstVal = getConstantVRegVal(MI.getOperand(1).getReg(), MRI); + auto ConstVal = getIConstantVRegVal(PtrAdd.getBaseReg(), MRI); return ConstVal && *ConstVal == 0; } assert(Ty.isVector() && "Expecting a vector type"); - const MachineInstr *VecMI = MRI.getVRegDef(MI.getOperand(1).getReg()); + const MachineInstr *VecMI = MRI.getVRegDef(PtrAdd.getBaseReg()); return isBuildVectorAllZeros(*VecMI, MRI); } void CombinerHelper::applyPtrAddZero(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD); - Builder.setInstrAndDebugLoc(MI); - Builder.buildIntToPtr(MI.getOperand(0), MI.getOperand(2)); - MI.eraseFromParent(); + auto &PtrAdd = cast<GPtrAdd>(MI); + Builder.setInstrAndDebugLoc(PtrAdd); + Builder.buildIntToPtr(PtrAdd.getReg(0), PtrAdd.getOffsetReg()); + PtrAdd.eraseFromParent(); } /// The second source operand is known to be a power of 2. @@ -3704,10 +3350,8 @@ bool CombinerHelper::matchLoadOrCombine( // may not use index 0. Register Ptr = LowestIdxLoad->getPointerReg(); const MachineMemOperand &MMO = LowestIdxLoad->getMMO(); - LegalityQuery::MemDesc MMDesc; + LegalityQuery::MemDesc MMDesc(MMO); MMDesc.MemoryTy = Ty; - MMDesc.AlignInBits = MMO.getAlign().value() * 8; - MMDesc.Ordering = MMO.getSuccessOrdering(); if (!isLegalOrBeforeLegalizer( {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}})) return false; @@ -3732,6 +3376,274 @@ bool CombinerHelper::matchLoadOrCombine( return true; } +/// Check if the store \p Store is a truncstore that can be merged. That is, +/// it's a store of a shifted value of \p SrcVal. 
If \p SrcVal is an empty +/// Register then it does not need to match and SrcVal is set to the source +/// value found. +/// On match, returns the start byte offset of the \p SrcVal that is being +/// stored. +static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal, + MachineRegisterInfo &MRI) { + Register TruncVal; + if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal)))) + return None; + + // The shift amount must be a constant multiple of the narrow type. + // It is translated to the offset address in the wide source value "y". + // + // x = G_LSHR y, ShiftAmtC + // s8 z = G_TRUNC x + // store z, ... + Register FoundSrcVal; + int64_t ShiftAmt; + if (!mi_match(TruncVal, MRI, + m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)), + m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) { + if (!SrcVal.isValid() || TruncVal == SrcVal) { + if (!SrcVal.isValid()) + SrcVal = TruncVal; + return 0; // If it's the lowest index store. + } + return None; + } + + unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits(); + if (ShiftAmt % NarrowBits != 0) + return None; + const unsigned Offset = ShiftAmt / NarrowBits; + + if (SrcVal.isValid() && FoundSrcVal != SrcVal) + return None; + + if (!SrcVal.isValid()) + SrcVal = FoundSrcVal; + else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal)) + return None; + return Offset; +} + +/// Match a pattern where a wide type scalar value is stored by several narrow +/// stores. Fold it into a single store or a BSWAP and a store if the target +/// supports it. +/// +/// Assuming little endian target: +/// i8 *p = ... +/// i32 val = ... +/// p[0] = (val >> 0) & 0xFF; +/// p[1] = (val >> 8) & 0xFF; +/// p[2] = (val >> 16) & 0xFF; +/// p[3] = (val >> 24) & 0xFF; +/// => +/// *((i32)p) = val; +/// +/// i8 *p = ... +/// i32 val = ... +/// p[0] = (val >> 24) & 0xFF; +/// p[1] = (val >> 16) & 0xFF; +/// p[2] = (val >> 8) & 0xFF; +/// p[3] = (val >> 0) & 0xFF; +/// => +/// *((i32)p) = BSWAP(val); +bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI, + MergeTruncStoresInfo &MatchInfo) { + auto &StoreMI = cast<GStore>(MI); + LLT MemTy = StoreMI.getMMO().getMemoryType(); + + // We only handle merging simple stores of 1-4 bytes. + if (!MemTy.isScalar()) + return false; + switch (MemTy.getSizeInBits()) { + case 8: + case 16: + case 32: + break; + default: + return false; + } + if (!StoreMI.isSimple()) + return false; + + // We do a simple search for mergeable stores prior to this one. + // Any potential alias hazard along the way terminates the search. + SmallVector<GStore *> FoundStores; + + // We're looking for: + // 1) a (store(trunc(...))) + // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get + // the partial value stored. + // 3) where the offsets form either a little or big-endian sequence. + + auto &LastStore = StoreMI; + + // The single base pointer that all stores must use. + Register BaseReg; + int64_t LastOffset; + if (!mi_match(LastStore.getPointerReg(), MRI, + m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) { + BaseReg = LastStore.getPointerReg(); + LastOffset = 0; + } + + GStore *LowestIdxStore = &LastStore; + int64_t LowestIdxOffset = LastOffset; + + Register WideSrcVal; + auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI); + if (!LowestShiftAmt) + return false; // Didn't match a trunc. + assert(WideSrcVal.isValid()); + + LLT WideStoreTy = MRI.getType(WideSrcVal); + // The wide type might not be a multiple of the memory type, e.g. s48 and s32.
+ if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0) + return false; + const unsigned NumStoresRequired = + WideStoreTy.getSizeInBits() / MemTy.getSizeInBits(); + + SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX); + OffsetMap[*LowestShiftAmt] = LastOffset; + FoundStores.emplace_back(&LastStore); + + // Search the block up for more stores. + // We use a search threshold of 10 instructions here because the combiner + // works top-down within a block, and we don't want to search an unbounded + // number of predecessor instructions trying to find matching stores. + // If we moved this optimization into a separate pass then we could probably + // use a more efficient search without having a hard-coded threshold. + const int MaxInstsToCheck = 10; + int NumInstsChecked = 0; + for (auto II = ++LastStore.getReverseIterator(); + II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck; + ++II) { + NumInstsChecked++; + GStore *NewStore; + if ((NewStore = dyn_cast<GStore>(&*II))) { + if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple()) + break; + } else if (II->isLoadFoldBarrier() || II->mayLoad()) { + break; + } else { + continue; // This is a safe instruction we can look past. + } + + Register NewBaseReg; + int64_t MemOffset; + // Check we're storing to the same base + some offset. + if (!mi_match(NewStore->getPointerReg(), MRI, + m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) { + NewBaseReg = NewStore->getPointerReg(); + MemOffset = 0; + } + if (BaseReg != NewBaseReg) + break; + + auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI); + if (!ShiftByteOffset) + break; + if (MemOffset < LowestIdxOffset) { + LowestIdxOffset = MemOffset; + LowestIdxStore = NewStore; + } + + // Map the offset in the store and the offset in the combined value, and + // early return if it has been set before. + if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired || + OffsetMap[*ShiftByteOffset] != INT64_MAX) + break; + OffsetMap[*ShiftByteOffset] = MemOffset; + + FoundStores.emplace_back(NewStore); + // Reset counter since we've found a matching inst. + NumInstsChecked = 0; + if (FoundStores.size() == NumStoresRequired) + break; + } + + if (FoundStores.size() != NumStoresRequired) { + return false; + } + + const auto &DL = LastStore.getMF()->getDataLayout(); + auto &C = LastStore.getMF()->getFunction().getContext(); + // Check that a store of the wide type is both allowed and fast on the target + bool Fast = false; + bool Allowed = getTargetLowering().allowsMemoryAccess( + C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast); + if (!Allowed || !Fast) + return false; + + // Check if the pieces of the value are going to the expected places in memory + // to merge the stores. + unsigned NarrowBits = MemTy.getScalarSizeInBits(); + auto checkOffsets = [&](bool MatchLittleEndian) { + if (MatchLittleEndian) { + for (unsigned i = 0; i != NumStoresRequired; ++i) + if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset) + return false; + } else { // MatchBigEndian by reversing loop counter. + for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired; + ++i, --j) + if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset) + return false; + } + return true; + }; + + // Check if the offsets line up for the native data layout of this target. + bool NeedBswap = false; + bool NeedRotate = false; + if (!checkOffsets(DL.isLittleEndian())) { + // Special-case: check if byte offsets line up for the opposite endian. 
+ if (NarrowBits == 8 && checkOffsets(DL.isBigEndian())) + NeedBswap = true; + else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian())) + NeedRotate = true; + else + return false; + } + + if (NeedBswap && + !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}})) + return false; + if (NeedRotate && + !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}})) + return false; + + MatchInfo.NeedBSwap = NeedBswap; + MatchInfo.NeedRotate = NeedRotate; + MatchInfo.LowestIdxStore = LowestIdxStore; + MatchInfo.WideSrcVal = WideSrcVal; + MatchInfo.FoundStores = std::move(FoundStores); + return true; +} + +void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI, + MergeTruncStoresInfo &MatchInfo) { + + Builder.setInstrAndDebugLoc(MI); + Register WideSrcVal = MatchInfo.WideSrcVal; + LLT WideStoreTy = MRI.getType(WideSrcVal); + + if (MatchInfo.NeedBSwap) { + WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0); + } else if (MatchInfo.NeedRotate) { + assert(WideStoreTy.getSizeInBits() % 2 == 0 && + "Unexpected type for rotate"); + auto RotAmt = + Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2); + WideSrcVal = + Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0); + } + + Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(), + MatchInfo.LowestIdxStore->getMMO().getPointerInfo(), + MatchInfo.LowestIdxStore->getMMO().getAlign()); + + // Erase the old stores. + for (auto *ST : MatchInfo.FoundStores) + ST->eraseFromParent(); +} + bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI) { assert(MI.getOpcode() == TargetOpcode::G_PHI); @@ -3844,7 +3756,7 @@ bool CombinerHelper::matchExtractVecEltBuildVec(MachineInstr &MI, {TargetOpcode::G_BUILD_VECTOR, {SrcTy, SrcTy.getElementType()}})) return false; - auto Cst = getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + auto Cst = getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); if (!Cst || Cst->Value.getZExtValue() >= SrcTy.getNumElements()) return false; @@ -3917,7 +3829,7 @@ bool CombinerHelper::matchExtractAllEltsFromBuildVector( MRI.use_instr_nodbg_end())) { if (II.getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT) return false; - auto Cst = getConstantVRegVal(II.getOperand(2).getReg(), MRI); + auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI); if (!Cst) return false; unsigned Idx = Cst.getValue().getZExtValue(); @@ -4064,6 +3976,78 @@ bool CombinerHelper::matchICmpToTrueFalseKnownBits(MachineInstr &MI, return true; } +bool CombinerHelper::matchICmpToLHSKnownBits( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_ICMP); + // Given: + // + // %x = G_WHATEVER (... x is known to be 0 or 1 ...) + // %cmp = G_ICMP ne %x, 0 + // + // Or: + // + // %x = G_WHATEVER (... x is known to be 0 or 1 ...) + // %cmp = G_ICMP eq %x, 1 + // + // We can replace %cmp with %x assuming true is 1 on the target. 
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + if (!CmpInst::isEquality(Pred)) + return false; + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst); + if (getICmpTrueVal(getTargetLowering(), DstTy.isVector(), + /* IsFP = */ false) != 1) + return false; + int64_t OneOrZero = Pred == CmpInst::ICMP_EQ; + if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(OneOrZero))) + return false; + Register LHS = MI.getOperand(2).getReg(); + auto KnownLHS = KB->getKnownBits(LHS); + if (KnownLHS.getMinValue() != 0 || KnownLHS.getMaxValue() != 1) + return false; + // Make sure replacing Dst with the LHS is a legal operation. + LLT LHSTy = MRI.getType(LHS); + unsigned LHSSize = LHSTy.getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned Op = TargetOpcode::COPY; + if (DstSize != LHSSize) + Op = DstSize < LHSSize ? TargetOpcode::G_TRUNC : TargetOpcode::G_ZEXT; + if (!isLegalOrBeforeLegalizer({Op, {DstTy, LHSTy}})) + return false; + MatchInfo = [=](MachineIRBuilder &B) { B.buildInstr(Op, {Dst}, {LHS}); }; + return true; +} + +// Replace (and (or x, c1), c2) with (and x, c2) iff c1 & c2 == 0 +bool CombinerHelper::matchAndOrDisjointMask( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_AND); + + // Ignore vector types to simplify matching the two constants. + // TODO: do this for vectors and scalars via a demanded bits analysis. + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty.isVector()) + return false; + + Register Src; + int64_t MaskAnd; + int64_t MaskOr; + if (!mi_match(MI, MRI, + m_GAnd(m_GOr(m_Reg(Src), m_ICst(MaskOr)), m_ICst(MaskAnd)))) + return false; + + // Check if MaskOr could turn on any bits in Src. + if (MaskAnd & MaskOr) + return false; + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Src); + Observer.changedInstr(MI); + }; + return true; +} + /// Form a G_SBFX from a G_SEXT_INREG fed by a right shift. bool CombinerHelper::matchBitfieldExtractFromSExtInReg( MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { @@ -4130,6 +4114,104 @@ bool CombinerHelper::matchBitfieldExtractFromAnd( return true; } +bool CombinerHelper::matchBitfieldExtractFromShr( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + const unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_ASHR || Opcode == TargetOpcode::G_LSHR); + + const Register Dst = MI.getOperand(0).getReg(); + + const unsigned ExtrOpcode = Opcode == TargetOpcode::G_ASHR + ? 
TargetOpcode::G_SBFX + : TargetOpcode::G_UBFX; + + // Check if the type we would use for the extract is legal + LLT Ty = MRI.getType(Dst); + LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + if (!LI || !LI->isLegalOrCustom({ExtrOpcode, {Ty, ExtractTy}})) + return false; + + Register ShlSrc; + int64_t ShrAmt; + int64_t ShlAmt; + const unsigned Size = Ty.getScalarSizeInBits(); + + // Try to match shr (shl x, c1), c2 + if (!mi_match(Dst, MRI, + m_BinOp(Opcode, + m_OneNonDBGUse(m_GShl(m_Reg(ShlSrc), m_ICst(ShlAmt))), + m_ICst(ShrAmt)))) + return false; + + // Make sure that the shift sizes can fit a bitfield extract + if (ShlAmt < 0 || ShlAmt > ShrAmt || ShrAmt >= Size) + return false; + + // Skip this combine if the G_SEXT_INREG combine could handle it + if (Opcode == TargetOpcode::G_ASHR && ShlAmt == ShrAmt) + return false; + + // Calculate start position and width of the extract + const int64_t Pos = ShrAmt - ShlAmt; + const int64_t Width = Size - ShrAmt; + + MatchInfo = [=](MachineIRBuilder &B) { + auto WidthCst = B.buildConstant(ExtractTy, Width); + auto PosCst = B.buildConstant(ExtractTy, Pos); + B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst}); + }; + return true; +} + +bool CombinerHelper::matchBitfieldExtractFromShrAnd( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + const unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_ASHR); + + const Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal( + TargetOpcode::G_UBFX, Ty, Ty)) + return false; + + // Try to match shr (and x, c1), c2 + Register AndSrc; + int64_t ShrAmt; + int64_t SMask; + if (!mi_match(Dst, MRI, + m_BinOp(Opcode, + m_OneNonDBGUse(m_GAnd(m_Reg(AndSrc), m_ICst(SMask))), + m_ICst(ShrAmt)))) + return false; + + const unsigned Size = Ty.getScalarSizeInBits(); + if (ShrAmt < 0 || ShrAmt >= Size) + return false; + + // Check that ubfx can do the extraction, with no holes in the mask. + uint64_t UMask = SMask; + UMask |= maskTrailingOnes<uint64_t>(ShrAmt); + UMask &= maskTrailingOnes<uint64_t>(Size); + if (!isMask_64(UMask)) + return false; + + // Calculate start position and width of the extract. + const int64_t Pos = ShrAmt; + const int64_t Width = countTrailingOnes(UMask) - ShrAmt; + + // It's preferable to keep the shift, rather than form G_SBFX. + // TODO: remove the G_AND via demanded bits analysis. 
+  if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto WidthCst = B.buildConstant(Ty, Width);
+    auto PosCst = B.buildConstant(Ty, Pos);
+    B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+  };
+  return true;
+}
+
 bool CombinerHelper::reassociationCanBreakAddressingModePattern(
     MachineInstr &PtrAdd) {
   assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD);
@@ -4144,10 +4226,10 @@ bool CombinerHelper::reassociationCanBreakAddressingModePattern(
   if (MRI.hasOneNonDBGUse(Src1Reg))
     return false;

-  auto C1 = getConstantVRegVal(Src1Def->getOperand(2).getReg(), MRI);
+  auto C1 = getIConstantVRegVal(Src1Def->getOperand(2).getReg(), MRI);
   if (!C1)
     return false;
-  auto C2 = getConstantVRegVal(Src2Reg, MRI);
+  auto C2 = getIConstantVRegVal(Src2Reg, MRI);
   if (!C2)
     return false;

@@ -4198,9 +4280,91 @@ bool CombinerHelper::reassociationCanBreakAddressingModePattern(
   return false;
 }

-bool CombinerHelper::matchReassocPtrAdd(
-    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD);
+bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI,
+                                                  MachineInstr *RHS,
+                                                  BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
+  Register Src1Reg = MI.getOperand(1).getReg();
+  if (RHS->getOpcode() != TargetOpcode::G_ADD)
+    return false;
+  auto C2 = getIConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
+  if (!C2)
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+
+    auto NewBase =
+        Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
+    Observer.changingInstr(MI);
+    MI.getOperand(1).setReg(NewBase.getReg(0));
+    MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
+    Observer.changedInstr(MI);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
+                                                  MachineInstr *LHS,
+                                                  MachineInstr *RHS,
+                                                  BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C),
+  // if and only if (G_PTR_ADD X, C) has one use.
+  Register LHSBase;
+  Optional<ValueAndVReg> LHSCstOff;
+  if (!mi_match(MI.getBaseReg(), MRI,
+                m_OneNonDBGUse(m_GPtrAdd(m_Reg(LHSBase), m_GCst(LHSCstOff)))))
+    return false;
+
+  auto *LHSPtrAdd = cast<GPtrAdd>(LHS);
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    // When we change LHSPtrAdd's offset register we might cause it to use a
+    // reg before its def. Sink the instruction before the outer PTR_ADD to
+    // ensure this doesn't happen.
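+    // An illustrative MIR sketch of this rewrite (register names
+    // hypothetical), with %cst defined by a G_CONSTANT:
+    //   %inner = G_PTR_ADD %base, %cst
+    //   %outer = G_PTR_ADD %inner, %idx
+    // becomes, after sinking %inner down to %outer:
+    //   %inner = G_PTR_ADD %base, %idx
+    //   %outer = G_PTR_ADD %inner, %cst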
+    LHSPtrAdd->moveBefore(&MI);
+    Register RHSReg = MI.getOffsetReg();
+    Observer.changingInstr(MI);
+    MI.getOperand(2).setReg(LHSCstOff->VReg);
+    Observer.changedInstr(MI);
+    Observer.changingInstr(*LHSPtrAdd);
+    LHSPtrAdd->getOperand(2).setReg(RHSReg);
+    Observer.changedInstr(*LHSPtrAdd);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocFoldConstantsInSubTree(GPtrAdd &MI,
+                                                        MachineInstr *LHS,
+                                                        MachineInstr *RHS,
+                                                        BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
+  auto *LHSPtrAdd = dyn_cast<GPtrAdd>(LHS);
+  if (!LHSPtrAdd)
+    return false;
+
+  Register Src2Reg = MI.getOperand(2).getReg();
+  Register LHSSrc1 = LHSPtrAdd->getBaseReg();
+  Register LHSSrc2 = LHSPtrAdd->getOffsetReg();
+  auto C1 = getIConstantVRegVal(LHSSrc2, MRI);
+  if (!C1)
+    return false;
+  auto C2 = getIConstantVRegVal(Src2Reg, MRI);
+  if (!C2)
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
+    Observer.changingInstr(MI);
+    MI.getOperand(1).setReg(LHSSrc1);
+    MI.getOperand(2).setReg(NewCst.getReg(0));
+    Observer.changedInstr(MI);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  auto &PtrAdd = cast<GPtrAdd>(MI);
   // We're trying to match a few pointer computation patterns here for
   // re-association opportunities.
   // 1) Isolating a constant operand to be on the RHS, e.g.:
   // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
@@ -4209,49 +4373,26 @@ bool CombinerHelper::matchReassocPtrAdd(
   // 2) Folding two constants in each sub-tree as long as such folding
   // doesn't break a legal addressing mode.
   // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
-  Register Src1Reg = MI.getOperand(1).getReg();
-  Register Src2Reg = MI.getOperand(2).getReg();
-  MachineInstr *LHS = MRI.getVRegDef(Src1Reg);
-  MachineInstr *RHS = MRI.getVRegDef(Src2Reg);
+  //
+  // 3) Move a constant from the LHS of an inner op to the RHS of the outer.
+  // G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
+  // iff (G_PTR_ADD X, C) has one use.
+  MachineInstr *LHS = MRI.getVRegDef(PtrAdd.getBaseReg());
+  MachineInstr *RHS = MRI.getVRegDef(PtrAdd.getOffsetReg());

-  if (LHS->getOpcode() != TargetOpcode::G_PTR_ADD) {
-    // Try to match example 1).
-    if (RHS->getOpcode() != TargetOpcode::G_ADD)
-      return false;
-    auto C2 = getConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
-    if (!C2)
-      return false;
+  // Try to match example 2.
+  if (matchReassocFoldConstantsInSubTree(PtrAdd, LHS, RHS, MatchInfo))
+    return true;

-    MatchInfo = [=,&MI](MachineIRBuilder &B) {
-      LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+  // Try to match example 3.
+  if (matchReassocConstantInnerLHS(PtrAdd, LHS, RHS, MatchInfo))
+    return true;

-      auto NewBase =
-          Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
-      Observer.changingInstr(MI);
-      MI.getOperand(1).setReg(NewBase.getReg(0));
-      MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
-      Observer.changedInstr(MI);
-    };
-  } else {
-    // Try to match example 2.
-    Register LHSSrc1 = LHS->getOperand(1).getReg();
-    Register LHSSrc2 = LHS->getOperand(2).getReg();
-    auto C1 = getConstantVRegVal(LHSSrc2, MRI);
-    if (!C1)
-      return false;
-    auto C2 = getConstantVRegVal(Src2Reg, MRI);
-    if (!C2)
-      return false;
+  // Try to match example 1.
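+  // An illustrative MIR sketch of example 1 (register names hypothetical),
+  // with %cst defined by a G_CONSTANT:
+  //   %off = G_ADD %x, %cst
+  //   %ptr = G_PTR_ADD %base, %off
+  // becomes:
+  //   %base2 = G_PTR_ADD %base, %x
+  //   %ptr = G_PTR_ADD %base2, %cst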
+ if (matchReassocConstantInnerRHS(PtrAdd, RHS, MatchInfo)) + return true; - MatchInfo = [=, &MI](MachineIRBuilder &B) { - auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2); - Observer.changingInstr(MI); - MI.getOperand(1).setReg(LHSSrc1); - MI.getOperand(2).setReg(NewCst.getReg(0)); - Observer.changedInstr(MI); - }; - } - return !reassociationCanBreakAddressingModePattern(MI); + return false; } bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) { @@ -4264,6 +4405,361 @@ bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) { return true; } +bool CombinerHelper::matchNarrowBinopFeedingAnd( + MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { + // Look for a binop feeding into an AND with a mask: + // + // %add = G_ADD %lhs, %rhs + // %and = G_AND %add, 000...11111111 + // + // Check if it's possible to perform the binop at a narrower width and zext + // back to the original width like so: + // + // %narrow_lhs = G_TRUNC %lhs + // %narrow_rhs = G_TRUNC %rhs + // %narrow_add = G_ADD %narrow_lhs, %narrow_rhs + // %new_add = G_ZEXT %narrow_add + // %and = G_AND %new_add, 000...11111111 + // + // This can allow later combines to eliminate the G_AND if it turns out + // that the mask is irrelevant. + assert(MI.getOpcode() == TargetOpcode::G_AND); + Register Dst = MI.getOperand(0).getReg(); + Register AndLHS = MI.getOperand(1).getReg(); + Register AndRHS = MI.getOperand(2).getReg(); + LLT WideTy = MRI.getType(Dst); + + // If the potential binop has more than one use, then it's possible that one + // of those uses will need its full width. + if (!WideTy.isScalar() || !MRI.hasOneNonDBGUse(AndLHS)) + return false; + + // Check if the LHS feeding the AND is impacted by the high bits that we're + // masking out. + // + // e.g. for 64-bit x, y: + // + // add_64(x, y) & 65535 == zext(add_16(trunc(x), trunc(y))) & 65535 + MachineInstr *LHSInst = getDefIgnoringCopies(AndLHS, MRI); + if (!LHSInst) + return false; + unsigned LHSOpc = LHSInst->getOpcode(); + switch (LHSOpc) { + default: + return false; + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + break; + } + + // Find the mask on the RHS. + auto Cst = getIConstantVRegValWithLookThrough(AndRHS, MRI); + if (!Cst) + return false; + auto Mask = Cst->Value; + if (!Mask.isMask()) + return false; + + // No point in combining if there's nothing to truncate. + unsigned NarrowWidth = Mask.countTrailingOnes(); + if (NarrowWidth == WideTy.getSizeInBits()) + return false; + LLT NarrowTy = LLT::scalar(NarrowWidth); + + // Check if adding the zext + truncates could be harmful. 
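+  // The combine trades one wide operation for a narrow one plus a G_TRUNC per
+  // operand and a G_ZEXT of the result, so it is only a win when the target
+  // reports those conversions as free and the casts are legal (or we have not
+  // yet run the legalizer).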
+ auto &MF = *MI.getMF(); + const auto &TLI = getTargetLowering(); + LLVMContext &Ctx = MF.getFunction().getContext(); + auto &DL = MF.getDataLayout(); + if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) || + !TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx)) + return false; + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}})) + return false; + Register BinOpLHS = LHSInst->getOperand(1).getReg(); + Register BinOpRHS = LHSInst->getOperand(2).getReg(); + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS); + auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS); + auto NarrowBinOp = + Builder.buildInstr(LHSOpc, {NarrowTy}, {NarrowLHS, NarrowRHS}); + auto Ext = Builder.buildZExt(WideTy, NarrowBinOp); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Ext.getReg(0)); + Observer.changedInstr(MI); + }; + return true; +} + +bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) { + unsigned Opc = MI.getOpcode(); + assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO); + // Check for a constant 2 or a splat of 2 on the RHS. + auto RHS = MI.getOperand(3).getReg(); + bool IsVector = MRI.getType(RHS).isVector(); + if (!IsVector && !mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(2))) + return false; + if (IsVector) { + // FIXME: There's no mi_match pattern for this yet. + auto *RHSDef = getDefIgnoringCopies(RHS, MRI); + if (!RHSDef) + return false; + auto Splat = getBuildVectorConstantSplat(*RHSDef, MRI); + if (!Splat || *Splat != 2) + return false; + } + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Observer.changingInstr(MI); + unsigned NewOpc = Opc == TargetOpcode::G_UMULO ? TargetOpcode::G_UADDO + : TargetOpcode::G_SADDO; + MI.setDesc(Builder.getTII().get(NewOpc)); + MI.getOperand(3).setReg(MI.getOperand(2).getReg()); + Observer.changedInstr(MI); + }; + return true; +} + +MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_UDIV); + auto &UDiv = cast<GenericMachineInstr>(MI); + Register Dst = UDiv.getReg(0); + Register LHS = UDiv.getReg(1); + Register RHS = UDiv.getReg(2); + LLT Ty = MRI.getType(Dst); + LLT ScalarTy = Ty.getScalarType(); + const unsigned EltBits = ScalarTy.getScalarSizeInBits(); + LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType(); + auto &MIB = Builder; + MIB.setInstrAndDebugLoc(MI); + + bool UseNPQ = false; + SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors; + + auto BuildUDIVPattern = [&](const Constant *C) { + auto *CI = cast<ConstantInt>(C); + const APInt &Divisor = CI->getValue(); + UnsignedDivisonByConstantInfo magics = + UnsignedDivisonByConstantInfo::get(Divisor); + unsigned PreShift = 0, PostShift = 0; + + // If the divisor is even, we can avoid using the expensive fixup by + // shifting the divided value upfront. + if (magics.IsAdd != 0 && !Divisor[0]) { + PreShift = Divisor.countTrailingZeros(); + // Get magic number for the shifted divisor. 
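+      // e.g. for a 32-bit divide by 28 = 4 * 7 (values illustrative):
+      // PreShift = 2, and the magic is recomputed for 7 knowing the shifted
+      // numerator has two leading zero bits, which avoids the NPQ fixup:
+      //   q = umulh(x >> 2, 0x92492493) >> 2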
+      magics =
+          UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+      assert(magics.IsAdd == 0 && "Should use cheap fixup now");
+    }
+
+    APInt Magic = magics.Magic;
+
+    bool SelNPQ;
+    if (magics.IsAdd == 0 || Divisor.isOneValue()) {
+      assert(magics.ShiftAmount < Divisor.getBitWidth() &&
+             "We shouldn't generate an undefined shift!");
+      PostShift = magics.ShiftAmount;
+      SelNPQ = false;
+    } else {
+      PostShift = magics.ShiftAmount - 1;
+      SelNPQ = true;
+    }
+
+    PreShifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
+    MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
+    NPQFactors.push_back(
+        MIB.buildConstant(ScalarTy,
+                          SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
+                                 : APInt::getZero(EltBits))
+            .getReg(0));
+    PostShifts.push_back(
+        MIB.buildConstant(ScalarShiftAmtTy, PostShift).getReg(0));
+    UseNPQ |= SelNPQ;
+    return true;
+  };
+
+  // Collect the shifts/magic values from each element.
+  bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
+  (void)Matched;
+  assert(Matched && "Expected unary predicate match to succeed");
+
+  Register PreShift, PostShift, MagicFactor, NPQFactor;
+  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+  if (RHSDef) {
+    PreShift = MIB.buildBuildVector(ShiftAmtTy, PreShifts).getReg(0);
+    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
+    NPQFactor = MIB.buildBuildVector(Ty, NPQFactors).getReg(0);
+    PostShift = MIB.buildBuildVector(ShiftAmtTy, PostShifts).getReg(0);
+  } else {
+    assert(MRI.getType(RHS).isScalar() &&
+           "Non-build_vector operation should have been a scalar");
+    PreShift = PreShifts[0];
+    MagicFactor = MagicFactors[0];
+    PostShift = PostShifts[0];
+  }
+
+  Register Q = LHS;
+  Q = MIB.buildLShr(Ty, Q, PreShift).getReg(0);
+
+  // Multiply the numerator (operand 0) by the magic value.
+  Q = MIB.buildUMulH(Ty, Q, MagicFactor).getReg(0);
+
+  if (UseNPQ) {
+    Register NPQ = MIB.buildSub(Ty, LHS, Q).getReg(0);
+
+    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+    // G_UMULH to act as a SRL-by-1 for NPQ, else multiply by zero.
+    if (Ty.isVector())
+      NPQ = MIB.buildUMulH(Ty, NPQ, NPQFactor).getReg(0);
+    else
+      NPQ = MIB.buildLShr(Ty, NPQ, MIB.buildConstant(ShiftAmtTy, 1)).getReg(0);
+
+    Q = MIB.buildAdd(Ty, NPQ, Q).getReg(0);
+  }
+
+  Q = MIB.buildLShr(Ty, Q, PostShift).getReg(0);
+  auto One = MIB.buildConstant(Ty, 1);
+  auto IsOne = MIB.buildICmp(
+      CmpInst::Predicate::ICMP_EQ,
+      Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
+  return MIB.buildSelect(Ty, IsOne, LHS, Q);
+}
+
+bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+  Register Dst = MI.getOperand(0).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
+  auto &MF = *MI.getMF();
+  AttributeList Attr = MF.getFunction().getAttributes();
+  const auto &TLI = getTargetLowering();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+    return false;
+
+  // Don't do this for minsize because the instruction sequence is usually
+  // larger.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  // Don't do this if the types are not going to be legal.
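+  // The expansion built by buildUDivUsingMul uses G_UMULH, G_LSHR, G_SUB,
+  // G_ADD, G_ICMP and G_SELECT; e.g. a 32-bit x / 7 becomes (magic values
+  // illustrative):
+  //   q   = umulh(x, 0x24924925)
+  //   npq = (x - q) >> 1
+  //   q   = (npq + q) >> 2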
+  if (LI) {
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+      return false;
+    if (!isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_ICMP,
+             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+              DstTy}}))
+      return false;
+  }
+
+  auto CheckEltValue = [&](const Constant *C) {
+    if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
+      return !CI->isZero();
+    return false;
+  };
+  return matchUnaryPredicate(MRI, RHS, CheckEltValue);
+}
+
+void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
+  auto *NewMI = buildUDivUsingMul(MI);
+  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UMULH);
+  Register RHS = MI.getOperand(2).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  auto MatchPow2ExceptOne = [&](const Constant *C) {
+    if (auto *CI = dyn_cast<ConstantInt>(C))
+      return CI->getValue().isPowerOf2() && !CI->getValue().isOne();
+    return false;
+  };
+  if (!matchUnaryPredicate(MRI, RHS, MatchPow2ExceptOne, false))
+    return false;
+  return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}});
+}
+
+void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) {
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  unsigned NumEltBits = Ty.getScalarSizeInBits();
+
+  Builder.setInstrAndDebugLoc(MI);
+  auto LogBase2 = buildLogBase2(RHS, Builder);
+  auto ShiftAmt =
+      Builder.buildSub(Ty, Builder.buildConstant(Ty, NumEltBits), LogBase2);
+  auto Trunc = Builder.buildZExtOrTrunc(ShiftAmtTy, ShiftAmt);
+  Builder.buildLShr(Dst, LHS, Trunc);
+  MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI,
+                                               BuildFnTy &MatchInfo) {
+  unsigned Opc = MI.getOpcode();
+  assert(Opc == TargetOpcode::G_FADD || Opc == TargetOpcode::G_FSUB ||
+         Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+         Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA);
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  LLT Type = MRI.getType(Dst);
+
+  // fold (fadd x, fneg(y)) -> (fsub x, y)
+  // fold (fadd fneg(y), x) -> (fsub x, y)
+  // G_FADD is commutative so both cases are checked by m_GFAdd
+  if (mi_match(Dst, MRI, m_GFAdd(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_FSUB, {Type}})) {
+    Opc = TargetOpcode::G_FSUB;
+  }
+  // fold (fsub x, fneg(y)) -> (fadd x, y)
+  else if (mi_match(Dst, MRI, m_GFSub(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+           isLegalOrBeforeLegalizer({TargetOpcode::G_FADD, {Type}})) {
+    Opc = TargetOpcode::G_FADD;
+  }
+  // fold (fmul fneg(x), fneg(y)) -> (fmul x, y)
+  // fold (fdiv fneg(x), fneg(y)) -> (fdiv x, y)
+  // fold (fmad fneg(x), fneg(y), z) -> (fmad x, y, z)
+  // fold (fma fneg(x), fneg(y), z) -> (fma x, y, z)
+  else if ((Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+            Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA) &&
+           mi_match(X, MRI, m_GFNeg(m_Reg(X))) &&
+           mi_match(Y, MRI, m_GFNeg(m_Reg(Y)))) {
+    // no opcode change
+  } else
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
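+    // Rewrite in place, e.g. (register names hypothetical):
+    //   %n = G_FNEG %y
+    //   %d = G_FADD %x, %n
+    // becomes
+    //   %d = G_FSUB %x, %y
+    // leaving the now-dead G_FNEG for later cleanup.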
+    Observer.changingInstr(MI);
+    MI.setDesc(B.getTII().get(Opc));
+    MI.getOperand(1).setReg(X);
+    MI.getOperand(2).setReg(Y);
+    Observer.changedInstr(MI);
+  };
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
