diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 1595 |
1 files changed, 1240 insertions, 355 deletions
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 244e7a9583d6..e7f40523efaf 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -29,6 +30,7 @@ using namespace llvm; using namespace LegalizeActions; +using namespace MIPatternMatch; /// Try to break down \p OrigTy into \p NarrowTy sized pieces. /// @@ -75,6 +77,8 @@ static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) { return Type::getFloatTy(Ctx); case 64: return Type::getDoubleTy(Ctx); + case 80: + return Type::getX86_FP80Ty(Ctx); case 128: return Type::getFP128Ty(Ctx); default: @@ -86,16 +90,15 @@ LegalizerHelper::LegalizerHelper(MachineFunction &MF, GISelChangeObserver &Observer, MachineIRBuilder &Builder) : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()), - LI(*MF.getSubtarget().getLegalizerInfo()) { - MIRBuilder.setChangeObserver(Observer); -} + LI(*MF.getSubtarget().getLegalizerInfo()), + TLI(*MF.getSubtarget().getTargetLowering()) { } LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI, GISelChangeObserver &Observer, MachineIRBuilder &B) - : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI) { - MIRBuilder.setChangeObserver(Observer); -} + : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI), + TLI(*MF.getSubtarget().getTargetLowering()) { } + LegalizerHelper::LegalizeResult LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { LLVM_DEBUG(dbgs() << "Legalizing: " << MI); @@ -237,22 +240,21 @@ void LegalizerHelper::insertParts(Register DstReg, } } -/// Return the result registers of G_UNMERGE_VALUES \p MI in \p Regs +/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs. static void getUnmergeResults(SmallVectorImpl<Register> &Regs, const MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES); + const int StartIdx = Regs.size(); const int NumResults = MI.getNumOperands() - 1; - Regs.resize(NumResults); + Regs.resize(Regs.size() + NumResults); for (int I = 0; I != NumResults; ++I) - Regs[I] = MI.getOperand(I).getReg(); + Regs[StartIdx + I] = MI.getOperand(I).getReg(); } -LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy, - LLT NarrowTy, Register SrcReg) { +void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, + LLT GCDTy, Register SrcReg) { LLT SrcTy = MRI.getType(SrcReg); - - LLT GCDTy = getGCDType(DstTy, getGCDType(SrcTy, NarrowTy)); if (SrcTy == GCDTy) { // If the source already evenly divides the result type, we don't need to do // anything. @@ -262,7 +264,13 @@ LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy, auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); getUnmergeResults(Parts, *Unmerge); } +} +LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy, + LLT NarrowTy, Register SrcReg) { + LLT SrcTy = MRI.getType(SrcReg); + LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy); + extractGCDType(Parts, GCDTy, SrcReg); return GCDTy; } @@ -376,7 +384,14 @@ void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy, } if (LCMTy.isVector()) { - MIRBuilder.buildExtract(DstReg, Remerge, 0); + unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits(); + SmallVector<Register, 8> UnmergeDefs(NumDefs); + UnmergeDefs[0] = DstReg; + for (unsigned I = 1; I != NumDefs; ++I) + UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy); + + MIRBuilder.buildUnmerge(UnmergeDefs, + MIRBuilder.buildMerge(LCMTy, RemergeRegs)); return; } @@ -384,7 +399,7 @@ void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy, } static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { -#define RTLIBCASE(LibcallPrefix) \ +#define RTLIBCASE_INT(LibcallPrefix) \ do { \ switch (Size) { \ case 32: \ @@ -398,19 +413,33 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { } \ } while (0) - assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size"); +#define RTLIBCASE(LibcallPrefix) \ + do { \ + switch (Size) { \ + case 32: \ + return RTLIB::LibcallPrefix##32; \ + case 64: \ + return RTLIB::LibcallPrefix##64; \ + case 80: \ + return RTLIB::LibcallPrefix##80; \ + case 128: \ + return RTLIB::LibcallPrefix##128; \ + default: \ + llvm_unreachable("unexpected size"); \ + } \ + } while (0) switch (Opcode) { case TargetOpcode::G_SDIV: - RTLIBCASE(SDIV_I); + RTLIBCASE_INT(SDIV_I); case TargetOpcode::G_UDIV: - RTLIBCASE(UDIV_I); + RTLIBCASE_INT(UDIV_I); case TargetOpcode::G_SREM: - RTLIBCASE(SREM_I); + RTLIBCASE_INT(SREM_I); case TargetOpcode::G_UREM: - RTLIBCASE(UREM_I); + RTLIBCASE_INT(UREM_I); case TargetOpcode::G_CTLZ_ZERO_UNDEF: - RTLIBCASE(CTLZ_I); + RTLIBCASE_INT(CTLZ_I); case TargetOpcode::G_FADD: RTLIBCASE(ADD_F); case TargetOpcode::G_FSUB: @@ -453,13 +482,16 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(RINT_F); case TargetOpcode::G_FNEARBYINT: RTLIBCASE(NEARBYINT_F); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + RTLIBCASE(ROUNDEVEN_F); } llvm_unreachable("Unknown libcall function"); } /// True if an instruction is in tail position in its caller. Intended for /// legalizing libcalls as tail calls when possible. -static bool isLibCallInTailPosition(MachineInstr &MI) { +static bool isLibCallInTailPosition(const TargetInstrInfo &TII, + MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const Function &F = MBB.getParent()->getFunction(); @@ -479,7 +511,6 @@ static bool isLibCallInTailPosition(MachineInstr &MI) { return false; // Only tail call if the following instruction is a standard return. - auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); auto Next = next_nodbg(MI.getIterator(), MBB.instr_end()); if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn()) return false; @@ -531,12 +562,11 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, LegalizerHelper::LegalizeResult llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); SmallVector<CallLowering::ArgInfo, 3> Args; // Add all the args, except for the last which is an imm denoting 'tail'. - for (unsigned i = 1; i < MI.getNumOperands() - 1; i++) { + for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) { Register Reg = MI.getOperand(i).getReg(); // Need derive an IR type for call lowering. @@ -551,31 +581,28 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - Intrinsic::ID ID = MI.getOperand(0).getIntrinsicID(); RTLIB::Libcall RTLibcall; - switch (ID) { - case Intrinsic::memcpy: + switch (MI.getOpcode()) { + case TargetOpcode::G_MEMCPY: RTLibcall = RTLIB::MEMCPY; break; - case Intrinsic::memset: - RTLibcall = RTLIB::MEMSET; - break; - case Intrinsic::memmove: + case TargetOpcode::G_MEMMOVE: RTLibcall = RTLIB::MEMMOVE; break; + case TargetOpcode::G_MEMSET: + RTLibcall = RTLIB::MEMSET; + break; default: return LegalizerHelper::UnableToLegalize; } const char *Name = TLI.getLibcallName(RTLibcall); - MIRBuilder.setInstrAndDebugLoc(MI); - CallLowering::CallLoweringInfo Info; Info.CallConv = TLI.getLibcallCallingConv(RTLibcall); Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx)); - Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() == 1 && - isLibCallInTailPosition(MI); + Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() && + isLibCallInTailPosition(MIRBuilder.getTII(), MI); std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs)); if (!CLI.lowerCall(MIRBuilder, Info)) @@ -668,10 +695,11 @@ LegalizerHelper::libcall(MachineInstr &MI) { case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FSQRT: case TargetOpcode::G_FRINT: - case TargetOpcode::G_FNEARBYINT: { + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); - if (!HLTy || (Size != 32 && Size != 64 && Size != 128)) { - LLVM_DEBUG(dbgs() << "No libcall available for size " << Size << ".\n"); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); return UnableToLegalize; } auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); @@ -720,6 +748,13 @@ LegalizerHelper::libcall(MachineInstr &MI) { return Status; break; } + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: { + LegalizeResult Result = createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI); + MI.eraseFromParent(); + return Result; + } } MI.eraseFromParent(); @@ -900,7 +935,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, case TargetOpcode::G_INSERT: return narrowScalarInsert(MI, TypeIdx, NarrowTy); case TargetOpcode::G_LOAD: { - const auto &MMO = **MI.memoperands_begin(); + auto &MMO = **MI.memoperands_begin(); Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); if (DstTy.isVector()) @@ -908,7 +943,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, if (8 * MMO.getSize() != DstTy.getSizeInBits()) { Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); - auto &MMO = **MI.memoperands_begin(); MIRBuilder.buildLoad(TmpReg, MI.getOperand(1), MMO); MIRBuilder.buildAnyExt(DstReg, TmpReg); MI.eraseFromParent(); @@ -925,10 +959,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); auto &MMO = **MI.memoperands_begin(); - if (MMO.getSizeInBits() == NarrowSize) { + unsigned MemSize = MMO.getSizeInBits(); + + if (MemSize == NarrowSize) { MIRBuilder.buildLoad(TmpReg, PtrReg, MMO); - } else { + } else if (MemSize < NarrowSize) { MIRBuilder.buildLoadInstr(MI.getOpcode(), TmpReg, PtrReg, MMO); + } else if (MemSize > NarrowSize) { + // FIXME: Need to split the load. + return UnableToLegalize; } if (ZExt) @@ -1204,6 +1243,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_PTRMASK: { if (TypeIdx != 1) return UnableToLegalize; @@ -1212,6 +1252,29 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_FPTOUI: { + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_FPTOSI: { + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_SEXT); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_FPEXT: + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT); + Observer.changedInstr(MI); + return Legalized; } } @@ -1272,10 +1335,8 @@ void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy, void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); - Register DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MIRBuilder.buildExtract(MO, DstExt, 0); - MO.setReg(DstExt); + MO.setReg(widenWithUnmerge(WideTy, MO.getReg())); } void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, @@ -1443,6 +1504,40 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) { + Register WideReg = MRI.createGenericVirtualRegister(WideTy); + LLT OrigTy = MRI.getType(OrigReg); + LLT LCMTy = getLCMType(WideTy, OrigTy); + + const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits(); + const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits(); + + Register UnmergeSrc = WideReg; + + // Create a merge to the LCM type, padding with undef + // %0:_(<3 x s32>) = G_FOO => <4 x s32> + // => + // %1:_(<4 x s32>) = G_FOO + // %2:_(<4 x s32>) = G_IMPLICIT_DEF + // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2 + // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3 + if (NumMergeParts > 1) { + Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0); + SmallVector<Register, 8> MergeParts(NumMergeParts, Undef); + MergeParts[0] = WideReg; + UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0); + } + + // Unmerge to the original register and pad with dead defs. + SmallVector<Register, 8> UnmergeResults(NumUnmergeParts); + UnmergeResults[0] = OrigReg; + for (int I = 1; I != NumUnmergeParts; ++I) + UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy); + + MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc); + return WideReg; +} + LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { @@ -1512,35 +1607,60 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc); - // Create a sequence of unmerges to the original results. since we may have - // widened the source, we will need to pad the results with dead defs to cover - // the source register. - // e.g. widen s16 to s32: - // %1:_(s16), %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0:_(s48) + // Create a sequence of unmerges and merges to the original results. Since we + // may have widened the source, we will need to pad the results with dead defs + // to cover the source register. + // e.g. widen s48 to s64: + // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96) // // => - // %4:_(s64) = G_ANYEXT %0:_(s48) - // %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4 ; Requested unmerge - // %1:_(s16), %2:_(s16) = G_UNMERGE_VALUES %5 ; unpack to original regs - // %3:_(s16), dead %7 = G_UNMERGE_VALUES %6 ; original reg + extra dead def - + // %4:_(s192) = G_ANYEXT %0:_(s96) + // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge + // ; unpack to GCD type, with extra dead defs + // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64) + // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64) + // dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64) + // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination + // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination + const LLT GCDTy = getGCDType(WideTy, DstTy); const int NumUnmerge = Unmerge->getNumOperands() - 1; - const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits(); - - for (int I = 0; I != NumUnmerge; ++I) { - auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); - - for (int J = 0; J != PartsPerUnmerge; ++J) { - int Idx = I * PartsPerUnmerge + J; - if (Idx < NumDst) - MIB.addDef(MI.getOperand(Idx).getReg()); - else { - // Create dead def for excess components. - MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); + const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits(); + + // Directly unmerge to the destination without going through a GCD type + // if possible + if (PartsPerRemerge == 1) { + const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits(); + + for (int I = 0; I != NumUnmerge; ++I) { + auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); + + for (int J = 0; J != PartsPerUnmerge; ++J) { + int Idx = I * PartsPerUnmerge + J; + if (Idx < NumDst) + MIB.addDef(MI.getOperand(Idx).getReg()); + else { + // Create dead def for excess components. + MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); + } } + + MIB.addUse(Unmerge.getReg(I)); } + } else { + SmallVector<Register, 16> Parts; + for (int J = 0; J != NumUnmerge; ++J) + extractGCDType(Parts, GCDTy, Unmerge.getReg(J)); + + SmallVector<Register, 8> RemergeParts; + for (int I = 0; I != NumDst; ++I) { + for (int J = 0; J < PartsPerRemerge; ++J) { + const int Idx = I * PartsPerRemerge + J; + RemergeParts.emplace_back(Parts[Idx]); + } - MIB.addUse(Unmerge.getReg(I)); + MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts); + RemergeParts.clear(); + } } MI.eraseFromParent(); @@ -1590,8 +1710,7 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { Src = MIRBuilder.buildAnyExt(WideTy, Src); ShiftTy = WideTy; - } else if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) - return UnableToLegalize; + } auto LShr = MIRBuilder.buildLShr( ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset)); @@ -1629,7 +1748,7 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { - if (TypeIdx != 0) + if (TypeIdx != 0 || WideTy.isVector()) return UnableToLegalize; Observer.changingInstr(MI); widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); @@ -1639,14 +1758,45 @@ LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, } LegalizerHelper::LegalizeResult -LegalizerHelper::widenScalarAddSubSat(MachineInstr &MI, unsigned TypeIdx, - LLT WideTy) { +LegalizerHelper::widenScalarAddoSubo(MachineInstr &MI, unsigned TypeIdx, + LLT WideTy) { + if (TypeIdx == 1) + return UnableToLegalize; // TODO + unsigned Op = MI.getOpcode(); + unsigned Opcode = Op == TargetOpcode::G_UADDO || Op == TargetOpcode::G_SADDO + ? TargetOpcode::G_ADD + : TargetOpcode::G_SUB; + unsigned ExtOpcode = + Op == TargetOpcode::G_UADDO || Op == TargetOpcode::G_USUBO + ? TargetOpcode::G_ZEXT + : TargetOpcode::G_SEXT; + auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)}); + auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)}); + // Do the arithmetic in the larger type. + auto NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}); + LLT OrigTy = MRI.getType(MI.getOperand(0).getReg()); + auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp); + auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp}); + // There is no overflow if the ExtOp is the same as NewOp. + MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp); + // Now trunc the NewOp to the original result. + MIRBuilder.buildTrunc(MI.getOperand(0), NewOp); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx, + LLT WideTy) { bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT || - MI.getOpcode() == TargetOpcode::G_SSUBSAT; + MI.getOpcode() == TargetOpcode::G_SSUBSAT || + MI.getOpcode() == TargetOpcode::G_SSHLSAT; + bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT || + MI.getOpcode() == TargetOpcode::G_USHLSAT; // We can convert this to: // 1. Any extend iN to iM // 2. SHL by M-N - // 3. [US][ADD|SUB]SAT + // 3. [US][ADD|SUB|SHL]SAT // 4. L/ASHR by M-N // // It may be more efficient to lower this to a min and a max operation in @@ -1657,11 +1807,14 @@ LegalizerHelper::widenScalarAddSubSat(MachineInstr &MI, unsigned TypeIdx, unsigned NewBits = WideTy.getScalarSizeInBits(); unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits(); + // Shifts must zero-extend the RHS to preserve the unsigned quantity, and + // must not left shift the RHS to preserve the shift amount. auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1)); - auto RHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2)); + auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2)) + : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2)); auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount); auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK); - auto ShiftR = MIRBuilder.buildShl(WideTy, RHS, ShiftK); + auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK); auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, {ShiftL, ShiftR}, MI.getFlags()); @@ -1689,34 +1842,18 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return widenScalarMergeValues(MI, TypeIdx, WideTy); case TargetOpcode::G_UNMERGE_VALUES: return widenScalarUnmergeValues(MI, TypeIdx, WideTy); + case TargetOpcode::G_SADDO: + case TargetOpcode::G_SSUBO: case TargetOpcode::G_UADDO: - case TargetOpcode::G_USUBO: { - if (TypeIdx == 1) - return UnableToLegalize; // TODO - auto LHSZext = MIRBuilder.buildZExt(WideTy, MI.getOperand(2)); - auto RHSZext = MIRBuilder.buildZExt(WideTy, MI.getOperand(3)); - unsigned Opcode = MI.getOpcode() == TargetOpcode::G_UADDO - ? TargetOpcode::G_ADD - : TargetOpcode::G_SUB; - // Do the arithmetic in the larger type. - auto NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSZext, RHSZext}); - LLT OrigTy = MRI.getType(MI.getOperand(0).getReg()); - APInt Mask = - APInt::getLowBitsSet(WideTy.getSizeInBits(), OrigTy.getSizeInBits()); - auto AndOp = MIRBuilder.buildAnd( - WideTy, NewOp, MIRBuilder.buildConstant(WideTy, Mask)); - // There is no overflow if the AndOp is the same as NewOp. - MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, AndOp); - // Now trunc the NewOp to the original result. - MIRBuilder.buildTrunc(MI.getOperand(0), NewOp); - MI.eraseFromParent(); - return Legalized; - } + case TargetOpcode::G_USUBO: + return widenScalarAddoSubo(MI, TypeIdx, WideTy); case TargetOpcode::G_SADDSAT: case TargetOpcode::G_SSUBSAT: + case TargetOpcode::G_SSHLSAT: case TargetOpcode::G_UADDSAT: case TargetOpcode::G_USUBSAT: - return widenScalarAddSubSat(MI, TypeIdx, WideTy); + case TargetOpcode::G_USHLSAT: + return widenScalarAddSubShlSat(MI, TypeIdx, WideTy); case TargetOpcode::G_CTTZ: case TargetOpcode::G_CTTZ_ZERO_UNDEF: case TargetOpcode::G_CTLZ: @@ -1908,21 +2045,25 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SITOFP: - if (TypeIdx != 1) - return UnableToLegalize; Observer.changingInstr(MI); - widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); + + if (TypeIdx == 0) + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + else + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); + Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_UITOFP: - if (TypeIdx != 1) - return UnableToLegalize; Observer.changingInstr(MI); - widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); + + if (TypeIdx == 0) + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + else + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); + Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: @@ -1936,7 +2077,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return UnableToLegalize; LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - if (!isPowerOf2_32(Ty.getSizeInBits())) + if (!Ty.isScalar()) return UnableToLegalize; Observer.changingInstr(MI); @@ -2134,6 +2275,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FPOW: case TargetOpcode::G_INTRINSIC_TRUNC: case TargetOpcode::G_INTRINSIC_ROUND: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: assert(TypeIdx == 0); Observer.changingInstr(MI); @@ -2143,6 +2285,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FPOWI: { + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_INTTOPTR: if (TypeIdx != 1) return UnableToLegalize; @@ -2169,8 +2320,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { // Avoid changing the result vector type if the source element type was // requested. if (TypeIdx == 1) { - auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); - MI.setDesc(TII.get(TargetOpcode::G_BUILD_VECTOR_TRUNC)); + MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC)); } else { widenScalarDst(MI, WideTy, 0); } @@ -2273,6 +2423,376 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) { return UnableToLegalize; } +/// Figure out the bit offset into a register when coercing a vector index for +/// the wide element type. This is only for the case when promoting vector to +/// one with larger elements. +// +/// +/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) +/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) +static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B, + Register Idx, + unsigned NewEltSize, + unsigned OldEltSize) { + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + LLT IdxTy = B.getMRI()->getType(Idx); + + // Now figure out the amount we need to shift to get the target bits. + auto OffsetMask = B.buildConstant( + IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio)); + auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask); + return B.buildShl(IdxTy, OffsetIdx, + B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0); +} + +/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this +/// is casting to a vector with a smaller element size, perform multiple element +/// extracts and merge the results. If this is coercing to a vector with larger +/// elements, index the bitcasted vector and extract the target element with bit +/// operations. This is intended to force the indexing in the native register +/// size for architectures that can dynamically index the register file. +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy) { + if (TypeIdx != 1) + return UnableToLegalize; + + Register Dst = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register Idx = MI.getOperand(2).getReg(); + LLT SrcVecTy = MRI.getType(SrcVec); + LLT IdxTy = MRI.getType(Idx); + + LLT SrcEltTy = SrcVecTy.getElementType(); + unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; + unsigned OldNumElts = SrcVecTy.getNumElements(); + + LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; + Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); + + const unsigned NewEltSize = NewEltTy.getSizeInBits(); + const unsigned OldEltSize = SrcEltTy.getSizeInBits(); + if (NewNumElts > OldNumElts) { + // Decreasing the vector element size + // + // e.g. i64 = extract_vector_elt x:v2i64, y:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // + // i64 = bitcast + // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), + // (i32 (extract_vector_elt castx, (2 * y + 1))) + // + if (NewNumElts % OldNumElts != 0) + return UnableToLegalize; + + // Type of the intermediate result vector. + const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts; + LLT MidTy = LLT::scalarOrVector(NewEltsPerOldElt, NewEltTy); + + auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt); + + SmallVector<Register, 8> NewOps(NewEltsPerOldElt); + auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK); + + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I); + auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset); + auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx); + NewOps[I] = Elt.getReg(0); + } + + auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps); + MIRBuilder.buildBitcast(Dst, NewVec); + MI.eraseFromParent(); + return Legalized; + } + + if (NewNumElts < OldNumElts) { + if (NewEltSize % OldEltSize != 0) + return UnableToLegalize; + + // This only depends on powers of 2 because we use bit tricks to figure out + // the bit offset we need to shift to get the target element. A general + // expansion could emit division/multiply. + if (!isPowerOf2_32(NewEltSize / OldEltSize)) + return UnableToLegalize; + + // Increasing the vector element size. + // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx + // + // => + // + // %cast = G_BITCAST %vec + // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize) + // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx + // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) + // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) + // %elt_bits = G_LSHR %wide_elt, %offset_bits + // %elt = G_TRUNC %elt_bits + + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); + + // Divide to get the index in the wider element type. + auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); + + Register WideElt = CastVec; + if (CastTy.isVector()) { + WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, + ScaledIdx).getReg(0); + } + + // Compute the bit offset into the register of the target element. + Register OffsetBits = getBitcastWiderVectorElementOffset( + MIRBuilder, Idx, NewEltSize, OldEltSize); + + // Shift the wide element to get the target element. + auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits); + MIRBuilder.buildTrunc(Dst, ExtractedBits); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +/// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p +/// TargetReg, while preserving other bits in \p TargetReg. +/// +/// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset) +static Register buildBitFieldInsert(MachineIRBuilder &B, + Register TargetReg, Register InsertReg, + Register OffsetBits) { + LLT TargetTy = B.getMRI()->getType(TargetReg); + LLT InsertTy = B.getMRI()->getType(InsertReg); + auto ZextVal = B.buildZExt(TargetTy, InsertReg); + auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits); + + // Produce a bitmask of the value to insert + auto EltMask = B.buildConstant( + TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(), + InsertTy.getSizeInBits())); + // Shift it into position + auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits); + auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask); + + // Clear out the bits in the wide element + auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask); + + // The value to insert has all zeros already, so stick it into the masked + // wide element. + return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0); +} + +/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this +/// is increasing the element size, perform the indexing in the target element +/// type, and use bit operations to insert at the element position. This is +/// intended for architectures that can dynamically index the register file and +/// want to force indexing in the native register size. +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy) { + if (TypeIdx != 0) + return UnableToLegalize; + + Register Dst = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register Val = MI.getOperand(2).getReg(); + Register Idx = MI.getOperand(3).getReg(); + + LLT VecTy = MRI.getType(Dst); + LLT IdxTy = MRI.getType(Idx); + + LLT VecEltTy = VecTy.getElementType(); + LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; + const unsigned NewEltSize = NewEltTy.getSizeInBits(); + const unsigned OldEltSize = VecEltTy.getSizeInBits(); + + unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; + unsigned OldNumElts = VecTy.getNumElements(); + + Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); + if (NewNumElts < OldNumElts) { + if (NewEltSize % OldEltSize != 0) + return UnableToLegalize; + + // This only depends on powers of 2 because we use bit tricks to figure out + // the bit offset we need to shift to get the target element. A general + // expansion could emit division/multiply. + if (!isPowerOf2_32(NewEltSize / OldEltSize)) + return UnableToLegalize; + + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); + + // Divide to get the index in the wider element type. + auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); + + Register ExtractedElt = CastVec; + if (CastTy.isVector()) { + ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, + ScaledIdx).getReg(0); + } + + // Compute the bit offset into the register of the target element. + Register OffsetBits = getBitcastWiderVectorElementOffset( + MIRBuilder, Idx, NewEltSize, OldEltSize); + + Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt, + Val, OffsetBits); + if (CastTy.isVector()) { + InsertedElt = MIRBuilder.buildInsertVectorElement( + CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0); + } + + MIRBuilder.buildBitcast(Dst, InsertedElt); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerLoad(MachineInstr &MI) { + // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT + Register DstReg = MI.getOperand(0).getReg(); + Register PtrReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + auto &MMO = **MI.memoperands_begin(); + + if (DstTy.getSizeInBits() == MMO.getSizeInBits()) { + if (MI.getOpcode() == TargetOpcode::G_LOAD) { + // This load needs splitting into power of 2 sized loads. + if (DstTy.isVector()) + return UnableToLegalize; + if (isPowerOf2_32(DstTy.getSizeInBits())) + return UnableToLegalize; // Don't know what we're being asked to do. + + // Our strategy here is to generate anyextending loads for the smaller + // types up to next power-2 result type, and then combine the two larger + // result values together, before truncating back down to the non-pow-2 + // type. + // E.g. v1 = i24 load => + // v2 = i32 zextload (2 byte) + // v3 = i32 load (1 byte) + // v4 = i32 shl v3, 16 + // v5 = i32 or v4, v2 + // v1 = i24 trunc v5 + // By doing this we generate the correct truncate which should get + // combined away as an artifact with a matching extend. + uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits()); + uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize; + + MachineFunction &MF = MIRBuilder.getMF(); + MachineMemOperand *LargeMMO = + MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); + MachineMemOperand *SmallMMO = MF.getMachineMemOperand( + &MMO, LargeSplitSize / 8, SmallSplitSize / 8); + + LLT PtrTy = MRI.getType(PtrReg); + unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits()); + LLT AnyExtTy = LLT::scalar(AnyExtSize); + Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy); + Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy); + auto LargeLoad = MIRBuilder.buildLoadInstr( + TargetOpcode::G_ZEXTLOAD, LargeLdReg, PtrReg, *LargeMMO); + + auto OffsetCst = MIRBuilder.buildConstant( + LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); + Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); + auto SmallPtr = + MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0)); + auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0), + *SmallMMO); + + auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize); + auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt); + auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad); + MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)}); + MI.eraseFromParent(); + return Legalized; + } + + MIRBuilder.buildLoad(DstReg, PtrReg, MMO); + MI.eraseFromParent(); + return Legalized; + } + + if (DstTy.isScalar()) { + Register TmpReg = + MRI.createGenericVirtualRegister(LLT::scalar(MMO.getSizeInBits())); + MIRBuilder.buildLoad(TmpReg, PtrReg, MMO); + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case TargetOpcode::G_LOAD: + MIRBuilder.buildAnyExtOrTrunc(DstReg, TmpReg); + break; + case TargetOpcode::G_SEXTLOAD: + MIRBuilder.buildSExt(DstReg, TmpReg); + break; + case TargetOpcode::G_ZEXTLOAD: + MIRBuilder.buildZExt(DstReg, TmpReg); + break; + } + + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerStore(MachineInstr &MI) { + // Lower a non-power of 2 store into multiple pow-2 stores. + // E.g. split an i24 store into an i16 store + i8 store. + // We do this by first extending the stored value to the next largest power + // of 2 type, and then using truncating stores to store the components. + // By doing this, likewise with G_LOAD, generate an extend that can be + // artifact-combined away instead of leaving behind extracts. + Register SrcReg = MI.getOperand(0).getReg(); + Register PtrReg = MI.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + MachineMemOperand &MMO = **MI.memoperands_begin(); + if (SrcTy.getSizeInBits() != MMO.getSizeInBits()) + return UnableToLegalize; + if (SrcTy.isVector()) + return UnableToLegalize; + if (isPowerOf2_32(SrcTy.getSizeInBits())) + return UnableToLegalize; // Don't know what we're being asked to do. + + // Extend to the next pow-2. + const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits())); + auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg); + + // Obtain the smaller value by shifting away the larger value. + uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits()); + uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize; + auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize); + auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt); + + // Generate the PtrAdd and truncating stores. + LLT PtrTy = MRI.getType(PtrReg); + auto OffsetCst = MIRBuilder.buildConstant( + LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); + Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); + auto SmallPtr = + MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0)); + + MachineFunction &MF = MIRBuilder.getMF(); + MachineMemOperand *LargeMMO = + MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); + MachineMemOperand *SmallMMO = + MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); + MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO); + MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { switch (MI.getOpcode()) { @@ -2321,13 +2841,24 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return bitcastExtractVectorElt(MI, TypeIdx, CastTy); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return bitcastInsertVectorElt(MI, TypeIdx, CastTy); default: return UnableToLegalize; } } +// Legalize an instruction by changing the opcode in place. +void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) { + Observer.changingInstr(MI); + MI.setDesc(MIRBuilder.getTII().get(NewOpcode)); + Observer.changedInstr(MI); +} + LegalizerHelper::LegalizeResult -LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { using namespace TargetOpcode; switch(MI.getOpcode()) { @@ -2337,6 +2868,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerBitcast(MI); case TargetOpcode::G_SREM: case TargetOpcode::G_UREM: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); auto Quot = MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty}, {MI.getOperand(1), MI.getOperand(2)}); @@ -2349,6 +2881,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { case TargetOpcode::G_SADDO: case TargetOpcode::G_SSUBO: return lowerSADDO_SSUBO(MI); + case TargetOpcode::G_UMULH: + case TargetOpcode::G_SMULH: + return lowerSMULH_UMULH(MI); case TargetOpcode::G_SMULO: case TargetOpcode::G_UMULO: { // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the @@ -2357,6 +2892,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { Register Overflow = MI.getOperand(1).getReg(); Register LHS = MI.getOperand(2).getReg(); Register RHS = MI.getOperand(3).getReg(); + LLT Ty = MRI.getType(Res); unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO ? TargetOpcode::G_SMULH @@ -2386,31 +2922,29 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return Legalized; } case TargetOpcode::G_FNEG: { + Register Res = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Res); + // TODO: Handle vector types once we are able to // represent them. if (Ty.isVector()) return UnableToLegalize; - Register Res = MI.getOperand(0).getReg(); - LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); - Type *ZeroTy = getFloatTypeForLLT(Ctx, Ty); - if (!ZeroTy) - return UnableToLegalize; - ConstantFP &ZeroForNegation = - *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy)); - auto Zero = MIRBuilder.buildFConstant(Ty, ZeroForNegation); + auto SignMask = + MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits())); Register SubByReg = MI.getOperand(1).getReg(); - Register ZeroReg = Zero.getReg(0); - MIRBuilder.buildFSub(Res, ZeroReg, SubByReg, MI.getFlags()); + MIRBuilder.buildXor(Res, SubByReg, SignMask); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FSUB: { + Register Res = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Res); + // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). // First, check if G_FNEG is marked as Lower. If so, we may // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. if (LI.getAction({G_FNEG, {Ty}}).Action == Lower) return UnableToLegalize; - Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); Register Neg = MRI.createGenericVirtualRegister(Ty); @@ -2425,6 +2959,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerFFloor(MI); case TargetOpcode::G_INTRINSIC_ROUND: return lowerIntrinsicRound(MI); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { + // Since round even is the assumed rounding mode for unconstrained FP + // operations, rint and roundeven are the same operation. + changeOpcode(MI, TargetOpcode::G_FRINT); + return Legalized; + } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { Register OldValRes = MI.getOperand(0).getReg(); Register SuccessRes = MI.getOperand(1).getReg(); @@ -2439,145 +2979,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { } case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: { - // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT - Register DstReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - auto &MMO = **MI.memoperands_begin(); - - if (DstTy.getSizeInBits() == MMO.getSizeInBits()) { - if (MI.getOpcode() == TargetOpcode::G_LOAD) { - // This load needs splitting into power of 2 sized loads. - if (DstTy.isVector()) - return UnableToLegalize; - if (isPowerOf2_32(DstTy.getSizeInBits())) - return UnableToLegalize; // Don't know what we're being asked to do. - - // Our strategy here is to generate anyextending loads for the smaller - // types up to next power-2 result type, and then combine the two larger - // result values together, before truncating back down to the non-pow-2 - // type. - // E.g. v1 = i24 load => - // v2 = i32 zextload (2 byte) - // v3 = i32 load (1 byte) - // v4 = i32 shl v3, 16 - // v5 = i32 or v4, v2 - // v1 = i24 trunc v5 - // By doing this we generate the correct truncate which should get - // combined away as an artifact with a matching extend. - uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits()); - uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize; - - MachineFunction &MF = MIRBuilder.getMF(); - MachineMemOperand *LargeMMO = - MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); - MachineMemOperand *SmallMMO = MF.getMachineMemOperand( - &MMO, LargeSplitSize / 8, SmallSplitSize / 8); - - LLT PtrTy = MRI.getType(PtrReg); - unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits()); - LLT AnyExtTy = LLT::scalar(AnyExtSize); - Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy); - Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy); - auto LargeLoad = MIRBuilder.buildLoadInstr( - TargetOpcode::G_ZEXTLOAD, LargeLdReg, PtrReg, *LargeMMO); - - auto OffsetCst = MIRBuilder.buildConstant( - LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0)); - auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0), - *SmallMMO); - - auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize); - auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt); - auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad); - MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)}); - MI.eraseFromParent(); - return Legalized; - } - MIRBuilder.buildLoad(DstReg, PtrReg, MMO); - MI.eraseFromParent(); - return Legalized; - } - - if (DstTy.isScalar()) { - Register TmpReg = - MRI.createGenericVirtualRegister(LLT::scalar(MMO.getSizeInBits())); - MIRBuilder.buildLoad(TmpReg, PtrReg, MMO); - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case TargetOpcode::G_LOAD: - MIRBuilder.buildExtOrTrunc(TargetOpcode::G_ANYEXT, DstReg, TmpReg); - break; - case TargetOpcode::G_SEXTLOAD: - MIRBuilder.buildSExt(DstReg, TmpReg); - break; - case TargetOpcode::G_ZEXTLOAD: - MIRBuilder.buildZExt(DstReg, TmpReg); - break; - } - MI.eraseFromParent(); - return Legalized; - } - - return UnableToLegalize; - } - case TargetOpcode::G_STORE: { - // Lower a non-power of 2 store into multiple pow-2 stores. - // E.g. split an i24 store into an i16 store + i8 store. - // We do this by first extending the stored value to the next largest power - // of 2 type, and then using truncating stores to store the components. - // By doing this, likewise with G_LOAD, generate an extend that can be - // artifact-combined away instead of leaving behind extracts. - Register SrcReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - MachineMemOperand &MMO = **MI.memoperands_begin(); - if (SrcTy.getSizeInBits() != MMO.getSizeInBits()) - return UnableToLegalize; - if (SrcTy.isVector()) - return UnableToLegalize; - if (isPowerOf2_32(SrcTy.getSizeInBits())) - return UnableToLegalize; // Don't know what we're being asked to do. - - // Extend to the next pow-2. - const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits())); - auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg); - - // Obtain the smaller value by shifting away the larger value. - uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits()); - uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize; - auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize); - auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt); - - // Generate the PtrAdd and truncating stores. - LLT PtrTy = MRI.getType(PtrReg); - auto OffsetCst = MIRBuilder.buildConstant( - LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); - Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); - auto SmallPtr = - MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0)); - - MachineFunction &MF = MIRBuilder.getMF(); - MachineMemOperand *LargeMMO = - MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); - MachineMemOperand *SmallMMO = - MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); - MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO); - MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO); - MI.eraseFromParent(); - return Legalized; - } + case TargetOpcode::G_ZEXTLOAD: + return lowerLoad(MI); + case TargetOpcode::G_STORE: + return lowerStore(MI); case TargetOpcode::G_CTLZ_ZERO_UNDEF: case TargetOpcode::G_CTTZ_ZERO_UNDEF: case TargetOpcode::G_CTLZ: case TargetOpcode::G_CTTZ: case TargetOpcode::G_CTPOP: - return lowerBitCount(MI, TypeIdx, Ty); + return lowerBitCount(MI); case G_UADDO: { Register Res = MI.getOperand(0).getReg(); Register CarryOut = MI.getOperand(1).getReg(); @@ -2639,22 +3050,24 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return Legalized; } case G_UITOFP: - return lowerUITOFP(MI, TypeIdx, Ty); + return lowerUITOFP(MI); case G_SITOFP: - return lowerSITOFP(MI, TypeIdx, Ty); + return lowerSITOFP(MI); case G_FPTOUI: - return lowerFPTOUI(MI, TypeIdx, Ty); + return lowerFPTOUI(MI); case G_FPTOSI: return lowerFPTOSI(MI); case G_FPTRUNC: - return lowerFPTRUNC(MI, TypeIdx, Ty); + return lowerFPTRUNC(MI); + case G_FPOWI: + return lowerFPOWI(MI); case G_SMIN: case G_SMAX: case G_UMIN: case G_UMAX: - return lowerMinMax(MI, TypeIdx, Ty); + return lowerMinMax(MI); case G_FCOPYSIGN: - return lowerFCopySign(MI, TypeIdx, Ty); + return lowerFCopySign(MI); case G_FMINNUM: case G_FMAXNUM: return lowerFMinNumMaxNum(MI); @@ -2677,6 +3090,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case G_EXTRACT_VECTOR_ELT: + case G_INSERT_VECTOR_ELT: + return lowerExtractInsertVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); case G_DYN_STACKALLOC: @@ -2692,33 +3108,123 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { case G_READ_REGISTER: case G_WRITE_REGISTER: return lowerReadWriteRegister(MI); + case G_UADDSAT: + case G_USUBSAT: { + // Try to make a reasonable guess about which lowering strategy to use. The + // target can override this with custom lowering and calling the + // implementation functions. + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (LI.isLegalOrCustom({G_UMIN, Ty})) + return lowerAddSubSatToMinMax(MI); + return lowerAddSubSatToAddoSubo(MI); + } + case G_SADDSAT: + case G_SSUBSAT: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + + // FIXME: It would probably make more sense to see if G_SADDO is preferred, + // since it's a shorter expansion. However, we would need to figure out the + // preferred boolean type for the carry out for the query. + if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty})) + return lowerAddSubSatToMinMax(MI); + return lowerAddSubSatToAddoSubo(MI); + } + case G_SSHLSAT: + case G_USHLSAT: + return lowerShlSat(MI); + case G_ABS: { + // Expand %res = G_ABS %a into: + // %v1 = G_ASHR %a, scalar_size-1 + // %v2 = G_ADD %a, %v1 + // %res = G_XOR %v2, %v1 + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + Register OpReg = MI.getOperand(1).getReg(); + auto ShiftAmt = + MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1); + auto Shift = + MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt); + auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift); + MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift); + MI.eraseFromParent(); + return Legalized; + } + case G_SELECT: + return lowerSelect(MI); } } +Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty, + Align MinAlign) const { + // FIXME: We're missing a way to go back from LLT to llvm::Type to query the + // datalayout for the preferred alignment. Also there should be a target hook + // for this to allow targets to reduce the alignment and ignore the + // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of + // the type. + return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign); +} + +MachineInstrBuilder +LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment, + MachinePointerInfo &PtrInfo) { + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false); + + unsigned AddrSpace = DL.getAllocaAddrSpace(); + LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + + PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx); + return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx); +} + +static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg, + LLT VecTy) { + int64_t IdxVal; + if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) + return IdxReg; + + LLT IdxTy = B.getMRI()->getType(IdxReg); + unsigned NElts = VecTy.getNumElements(); + if (isPowerOf2_32(NElts)) { + APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts)); + return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0); + } + + return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1)) + .getReg(0); +} + +Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, + Register Index) { + LLT EltTy = VecTy.getElementType(); + + // Calculate the element offset and add it to the pointer. + unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size. + assert(EltSize * 8 == EltTy.getSizeInBits() && + "Converting bits to bytes lost precision"); + + Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy); + + LLT IdxTy = MRI.getType(Index); + auto Mul = MIRBuilder.buildMul(IdxTy, Index, + MIRBuilder.buildConstant(IdxTy, EltSize)); + + LLT PtrTy = MRI.getType(VecPtr); + return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0); +} + LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - SmallVector<Register, 2> DstRegs; - - unsigned NarrowSize = NarrowTy.getSizeInBits(); Register DstReg = MI.getOperand(0).getReg(); - unsigned Size = MRI.getType(DstReg).getSizeInBits(); - int NumParts = Size / NarrowSize; - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. - if (Size % NarrowSize != 0) - return UnableToLegalize; + LLT DstTy = MRI.getType(DstReg); + LLT LCMTy = getLCMType(DstTy, NarrowTy); - for (int i = 0; i < NumParts; ++i) { - Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); - MIRBuilder.buildUndef(TmpReg); - DstRegs.push_back(TmpReg); - } + unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits(); - if (NarrowTy.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); + auto NewUndef = MIRBuilder.buildUndef(NarrowTy); + SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0)); + buildWidenedRemergeToDst(DstReg, LCMTy, Parts); MI.eraseFromParent(); return Legalized; } @@ -2839,7 +3345,7 @@ LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx, if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements()) return UnableToLegalize; - NarrowTy1 = LLT::vector(NumParts, SrcTy.getElementType().getSizeInBits()); + NarrowTy1 = LLT::vector(NarrowTy.getNumElements(), SrcTy.getElementType()); } else { NumParts = DstTy.getNumElements(); NarrowTy1 = SrcTy.getElementType(); @@ -3112,63 +3618,116 @@ LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, return Legalized; } +// Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces +// a vector +// +// Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with +// undef as necessary. +// +// %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2 +// -> <2 x s16> +// +// %4:_(s16) = G_IMPLICIT_DEF +// %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1 +// %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4 +// %7:_(<2 x s16>) = G_IMPLICIT_DEF +// %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7 +// %3:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %8 LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI, - unsigned TypeIdx, - LLT NarrowTy) { - assert(TypeIdx == 0 && "not a vector type index"); +LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy) { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = DstTy.getElementType(); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy); - int DstNumElts = DstTy.getNumElements(); - int NarrowNumElts = NarrowTy.getNumElements(); - int NumConcat = (DstNumElts + NarrowNumElts - 1) / NarrowNumElts; - LLT WidenedDstTy = LLT::vector(NarrowNumElts * NumConcat, SrcTy); + // Break into a common type + SmallVector<Register, 16> Parts; + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg()); - SmallVector<Register, 8> ConcatOps; - SmallVector<Register, 8> SubBuildVector; + // Build the requested new merge, padding with undef. + LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, + TargetOpcode::G_ANYEXT); - Register UndefReg; - if (WidenedDstTy != DstTy) - UndefReg = MIRBuilder.buildUndef(SrcTy).getReg(0); + // Pack into the original result register. + buildWidenedRemergeToDst(DstReg, LCMTy, Parts); - // Create a G_CONCAT_VECTORS of NarrowTy pieces, padding with undef as - // necessary. - // - // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2 - // -> <2 x s16> - // - // %4:_(s16) = G_IMPLICIT_DEF - // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1 - // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4 - // %7:_(<4 x s16>) = G_CONCAT_VECTORS %5, %6 - // %3:_(<3 x s16>) = G_EXTRACT %7, 0 - for (int I = 0; I != NumConcat; ++I) { - for (int J = 0; J != NarrowNumElts; ++J) { - int SrcIdx = NarrowNumElts * I + J; - - if (SrcIdx < DstNumElts) { - Register SrcReg = MI.getOperand(SrcIdx + 1).getReg(); - SubBuildVector.push_back(SrcReg); - } else - SubBuildVector.push_back(UndefReg); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, + unsigned TypeIdx, + LLT NarrowVecTy) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register InsertVal; + bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT; + + assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index"); + if (IsInsert) + InsertVal = MI.getOperand(2).getReg(); + + Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); + + // TODO: Handle total scalarization case. + if (!NarrowVecTy.isVector()) + return UnableToLegalize; + + LLT VecTy = MRI.getType(SrcVec); + + // If the index is a constant, we can really break this down as you would + // expect, and index into the target size pieces. + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal))) { + // Avoid out of bounds indexing the pieces. + if (IdxVal >= VecTy.getNumElements()) { + MIRBuilder.buildUndef(DstReg); + MI.eraseFromParent(); + return Legalized; } - auto BuildVec = MIRBuilder.buildBuildVector(NarrowTy, SubBuildVector); - ConcatOps.push_back(BuildVec.getReg(0)); - SubBuildVector.clear(); - } + SmallVector<Register, 8> VecParts; + LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); + + // Build a sequence of NarrowTy pieces in VecParts for this operand. + LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, + TargetOpcode::G_ANYEXT); + + unsigned NewNumElts = NarrowVecTy.getNumElements(); + + LLT IdxTy = MRI.getType(Idx); + int64_t PartIdx = IdxVal / NewNumElts; + auto NewIdx = + MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); - if (DstTy == WidenedDstTy) - MIRBuilder.buildConcatVectors(DstReg, ConcatOps); - else { - auto Concat = MIRBuilder.buildConcatVectors(WidenedDstTy, ConcatOps); - MIRBuilder.buildExtract(DstReg, Concat, 0); + if (IsInsert) { + LLT PartTy = MRI.getType(VecParts[PartIdx]); + + // Use the adjusted index to insert into one of the subvectors. + auto InsertPart = MIRBuilder.buildInsertVectorElement( + PartTy, VecParts[PartIdx], InsertVal, NewIdx); + VecParts[PartIdx] = InsertPart.getReg(0); + + // Recombine the inserted subvector with the others to reform the result + // vector. + buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); + } else { + MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + } + + MI.eraseFromParent(); + return Legalized; } - MI.eraseFromParent(); - return Legalized; + // With a variable index, we can't perform the operation in a smaller type, so + // we're forced to expand this. + // + // TODO: We could emit a chain of compare/select to figure out which piece to + // index. + return lowerExtractInsertVectorElt(MI); } LegalizerHelper::LegalizeResult @@ -3214,7 +3773,8 @@ LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, if (NumParts == -1) return UnableToLegalize; - const LLT OffsetTy = LLT::scalar(MRI.getType(AddrReg).getScalarSizeInBits()); + LLT PtrTy = MRI.getType(AddrReg); + const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); unsigned TotalSize = ValTy.getSizeInBits(); @@ -3412,6 +3972,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_ADD: case G_SUB: case G_MUL: + case G_PTR_ADD: case G_SMULH: case G_UMULH: case G_FADD: @@ -3435,6 +3996,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_FFLOOR: case G_FRINT: case G_INTRINSIC_ROUND: + case G_INTRINSIC_ROUNDEVEN: case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: @@ -3466,6 +4028,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_SHL: case G_LSHR: case G_ASHR: + case G_SSHLSAT: + case G_USHLSAT: case G_CTLZ: case G_CTLZ_ZERO_UNDEF: case G_CTTZ: @@ -3496,7 +4060,15 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_UNMERGE_VALUES: return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy); case G_BUILD_VECTOR: - return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy); + assert(TypeIdx == 0 && "not a vector type index"); + return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy); + case G_CONCAT_VECTORS: + if (TypeIdx != 1) // TODO: This probably does work as expected already. + return UnableToLegalize; + return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy); + case G_EXTRACT_VECTOR_ELT: + case G_INSERT_VECTOR_ELT: + return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy); case G_LOAD: case G_STORE: return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy); @@ -4268,9 +4840,9 @@ LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::lowerBitCount(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); - auto &TII = *MI.getMF()->getSubtarget().getInstrInfo(); + const auto &TII = MIRBuilder.getTII(); auto isSupported = [this](const LegalityQuery &Q) { auto QAction = LI.getAction(Q).Action; return QAction == Legal || QAction == Libcall || QAction == Custom; @@ -4358,15 +4930,15 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { // unless the target has ctlz but not ctpop, in which case we use: // { return 32 - nlz(~x & (x-1)); } // Ref: "Hacker's Delight" by Henry Warren - auto MIBCstNeg1 = MIRBuilder.buildConstant(Ty, -1); - auto MIBNot = MIRBuilder.buildXor(Ty, SrcReg, MIBCstNeg1); + auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1); + auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1); auto MIBTmp = MIRBuilder.buildAnd( - Ty, MIBNot, MIRBuilder.buildAdd(Ty, SrcReg, MIBCstNeg1)); - if (!isSupported({TargetOpcode::G_CTPOP, {Ty, Ty}}) && - isSupported({TargetOpcode::G_CTLZ, {Ty, Ty}})) { - auto MIBCstLen = MIRBuilder.buildConstant(Ty, Len); + SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1)); + if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) && + isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) { + auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len); MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen, - MIRBuilder.buildCTLZ(Ty, MIBTmp)); + MIRBuilder.buildCTLZ(SrcTy, MIBTmp)); MI.eraseFromParent(); return Legalized; } @@ -4375,6 +4947,8 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return Legalized; } case TargetOpcode::G_CTPOP: { + Register SrcReg = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(SrcReg); unsigned Size = Ty.getSizeInBits(); MachineIRBuilder &B = MIRBuilder; @@ -4384,11 +4958,11 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { // B2Count = val - { (val >> 1) & 0x55555555 } // since it gives same result in blocks of 2 with one instruction less. auto C_1 = B.buildConstant(Ty, 1); - auto B2Set1LoTo1Hi = B.buildLShr(Ty, MI.getOperand(1).getReg(), C_1); + auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1); APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55)); auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0); auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0); - auto B2Count = B.buildSub(Ty, MI.getOperand(1).getReg(), B2Count1Hi); + auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi); // In order to get count in blocks of 4 add values from adjacent block of 2. // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 } @@ -4487,8 +5061,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { return Legalized; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); LLT DstTy = MRI.getType(Dst); @@ -4516,8 +5089,7 @@ LegalizerHelper::lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return UnableToLegalize; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); LLT DstTy = MRI.getType(Dst); @@ -4563,8 +5135,7 @@ LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return UnableToLegalize; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); LLT DstTy = MRI.getType(Dst); @@ -4781,7 +5352,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -4796,6 +5367,20 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return UnableToLegalize; } +// TODO: If RHS is a constant SelectionDAGBuilder expands this into a +// multiplication tree. +LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(Dst); + + auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1); + MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags()); + MI.eraseFromParent(); + return Legalized; +} + static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { case TargetOpcode::G_SMIN: @@ -4811,8 +5396,7 @@ static CmpInst::Predicate minMaxToCompare(unsigned Opc) { } } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src0 = MI.getOperand(1).getReg(); Register Src1 = MI.getOperand(2).getReg(); @@ -4828,7 +5412,7 @@ LegalizerHelper::lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { +LegalizerHelper::lowerFCopySign(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src0 = MI.getOperand(1).getReg(); Register Src1 = MI.getOperand(2).getReg(); @@ -5050,6 +5634,71 @@ LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) { return Legalized; } +/// Lower a vector extract or insert by writing the vector to a stack temporary +/// and reloading the element or vector. +/// +/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx +/// => +/// %stack_temp = G_FRAME_INDEX +/// G_STORE %vec, %stack_temp +/// %idx = clamp(%idx, %vec.getNumElements()) +/// %element_ptr = G_PTR_ADD %stack_temp, %idx +/// %dst = G_LOAD %element_ptr +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register InsertVal; + if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT) + InsertVal = MI.getOperand(2).getReg(); + + Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); + + LLT VecTy = MRI.getType(SrcVec); + LLT EltTy = VecTy.getElementType(); + if (!EltTy.isByteSized()) { // Not implemented. + LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); + return UnableToLegalize; + } + + unsigned EltBytes = EltTy.getSizeInBytes(); + Align VecAlign = getStackTemporaryAlignment(VecTy); + Align EltAlign; + + MachinePointerInfo PtrInfo; + auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()), + VecAlign, PtrInfo); + MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign); + + // Get the pointer to the element, and be sure not to hit undefined behavior + // if the index is out of bounds. + Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); + + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal))) { + int64_t Offset = IdxVal * EltBytes; + PtrInfo = PtrInfo.getWithOffset(Offset); + EltAlign = commonAlignment(VecAlign, Offset); + } else { + // We lose information with a variable offset. + EltAlign = getStackTemporaryAlignment(EltTy); + PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace()); + } + + if (InsertVal) { + // Write the inserted element + MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign); + + // Reload the whole vector. + MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign); + } else { + MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign); + } + + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); @@ -5120,7 +5769,6 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { LLT PtrTy = MRI.getType(Dst); LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); - const auto &TLI = *MF.getSubtarget().getTargetLowering(); Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg); SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp); @@ -5266,6 +5914,185 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { } LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) { + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(Res); + bool IsSigned; + bool IsAdd; + unsigned BaseOp; + switch (MI.getOpcode()) { + default: + llvm_unreachable("unexpected addsat/subsat opcode"); + case TargetOpcode::G_UADDSAT: + IsSigned = false; + IsAdd = true; + BaseOp = TargetOpcode::G_ADD; + break; + case TargetOpcode::G_SADDSAT: + IsSigned = true; + IsAdd = true; + BaseOp = TargetOpcode::G_ADD; + break; + case TargetOpcode::G_USUBSAT: + IsSigned = false; + IsAdd = false; + BaseOp = TargetOpcode::G_SUB; + break; + case TargetOpcode::G_SSUBSAT: + IsSigned = true; + IsAdd = false; + BaseOp = TargetOpcode::G_SUB; + break; + } + + if (IsSigned) { + // sadd.sat(a, b) -> + // hi = 0x7fffffff - smax(a, 0) + // lo = 0x80000000 - smin(a, 0) + // a + smin(smax(lo, b), hi) + // ssub.sat(a, b) -> + // lo = smax(a, -1) - 0x7fffffff + // hi = smin(a, -1) - 0x80000000 + // a - smin(smax(lo, b), hi) + // TODO: AMDGPU can use a "median of 3" instruction here: + // a +/- med3(lo, b, hi) + uint64_t NumBits = Ty.getScalarSizeInBits(); + auto MaxVal = + MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits)); + auto MinVal = + MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits)); + MachineInstrBuilder Hi, Lo; + if (IsAdd) { + auto Zero = MIRBuilder.buildConstant(Ty, 0); + Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero)); + Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero)); + } else { + auto NegOne = MIRBuilder.buildConstant(Ty, -1); + Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne), + MaxVal); + Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne), + MinVal); + } + auto RHSClamped = + MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi); + MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped}); + } else { + // uadd.sat(a, b) -> a + umin(~a, b) + // usub.sat(a, b) -> a - umin(a, b) + Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS; + auto Min = MIRBuilder.buildUMin(Ty, Not, RHS); + MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min}); + } + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) { + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(Res); + LLT BoolTy = Ty.changeElementSize(1); + bool IsSigned; + bool IsAdd; + unsigned OverflowOp; + switch (MI.getOpcode()) { + default: + llvm_unreachable("unexpected addsat/subsat opcode"); + case TargetOpcode::G_UADDSAT: + IsSigned = false; + IsAdd = true; + OverflowOp = TargetOpcode::G_UADDO; + break; + case TargetOpcode::G_SADDSAT: + IsSigned = true; + IsAdd = true; + OverflowOp = TargetOpcode::G_SADDO; + break; + case TargetOpcode::G_USUBSAT: + IsSigned = false; + IsAdd = false; + OverflowOp = TargetOpcode::G_USUBO; + break; + case TargetOpcode::G_SSUBSAT: + IsSigned = true; + IsAdd = false; + OverflowOp = TargetOpcode::G_SSUBO; + break; + } + + auto OverflowRes = + MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS}); + Register Tmp = OverflowRes.getReg(0); + Register Ov = OverflowRes.getReg(1); + MachineInstrBuilder Clamp; + if (IsSigned) { + // sadd.sat(a, b) -> + // {tmp, ov} = saddo(a, b) + // ov ? (tmp >>s 31) + 0x80000000 : r + // ssub.sat(a, b) -> + // {tmp, ov} = ssubo(a, b) + // ov ? (tmp >>s 31) + 0x80000000 : r + uint64_t NumBits = Ty.getScalarSizeInBits(); + auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1); + auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount); + auto MinVal = + MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits)); + Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal); + } else { + // uadd.sat(a, b) -> + // {tmp, ov} = uaddo(a, b) + // ov ? 0xffffffff : tmp + // usub.sat(a, b) -> + // {tmp, ov} = usubo(a, b) + // ov ? 0 : tmp + Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0); + } + MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerShlSat(MachineInstr &MI) { + assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT || + MI.getOpcode() == TargetOpcode::G_USHLSAT) && + "Expected shlsat opcode!"); + bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT; + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(Res); + LLT BoolTy = Ty.changeElementSize(1); + + unsigned BW = Ty.getScalarSizeInBits(); + auto Result = MIRBuilder.buildShl(Ty, LHS, RHS); + auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS) + : MIRBuilder.buildLShr(Ty, Result, RHS); + + MachineInstrBuilder SatVal; + if (IsSigned) { + auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW)); + auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW)); + auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS, + MIRBuilder.buildConstant(Ty, 0)); + SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax); + } else { + SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW)); + } + auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig); + MIRBuilder.buildSelect(Res, Ov, SatVal, Result); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -5345,8 +6172,6 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) { MachineFunction &MF = MIRBuilder.getMF(); - const TargetSubtargetInfo &STI = MF.getSubtarget(); - const TargetLowering *TLI = STI.getTargetLowering(); bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER; int NameOpIdx = IsRead ? 1 : 0; @@ -5357,7 +6182,7 @@ LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) { const MDString *RegStr = cast<MDString>( cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0)); - Register PhysReg = TLI->getRegisterByName(RegStr->getString().data(), Ty, MF); + Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF); if (!PhysReg.isValid()) return UnableToLegalize; @@ -5369,3 +6194,63 @@ LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) { MI.eraseFromParent(); return Legalized; } + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) { + bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH; + unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; + Register Result = MI.getOperand(0).getReg(); + LLT OrigTy = MRI.getType(Result); + auto SizeInBits = OrigTy.getScalarSizeInBits(); + LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2); + + auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)}); + auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)}); + auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS); + unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR; + + auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits); + auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt}); + MIRBuilder.buildTrunc(Result, Shifted); + + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) { + // Implement vector G_SELECT in terms of XOR, AND, OR. + Register DstReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register Op1Reg = MI.getOperand(2).getReg(); + Register Op2Reg = MI.getOperand(3).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT MaskTy = MRI.getType(MaskReg); + LLT Op1Ty = MRI.getType(Op1Reg); + if (!DstTy.isVector()) + return UnableToLegalize; + + // Vector selects can have a scalar predicate. If so, splat into a vector and + // finish for later legalization attempts to try again. + if (MaskTy.isScalar()) { + Register MaskElt = MaskReg; + if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits()) + MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0); + // Generate a vector splat idiom to be pattern matched later. + auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(ShufSplat.getReg(0)); + Observer.changedInstr(MI); + return Legalized; + } + + if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) { + return UnableToLegalize; + } + + auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg); + auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg); + auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask); + MIRBuilder.buildOr(DstReg, NewOp1, NewOp2); + MI.eraseFromParent(); + return Legalized; +} |