Diffstat (limited to 'lib/Target/AMDGPU')
126 files changed, 9840 insertions, 5142 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 19a8bd901629..b64422ae5427 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -188,6 +188,10 @@ ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createR600OpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); +ModulePass *createAMDGPUPrintfRuntimeBinding(); +void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&); +extern char &AMDGPUPrintfRuntimeBindingID; + ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index baeba534012c..42b477e07b3b 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -10,6 +10,15 @@ include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +def p0 : PtrValueType<i64, 0>; +def p1 : PtrValueType<i64, 1>; +def p2 : PtrValueType<i32, 2>; +def p3 : PtrValueType<i32, 3>; +def p4 : PtrValueType<i64, 4>; +def p5 : PtrValueType<i32, 5>; +def p6 : PtrValueType<i32, 6>; + + class BoolToList<bit Value> { list<int> ret = !if(Value, [1]<int>, []<int>); } @@ -145,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" >; +def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", + "HasMFMAInlineLiteralBug", + "true", + "MFMA cannot use inline literal as SrcC" +>; + def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", "HasVcmpxPermlaneHazard", "true", @@ -802,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, FeatureSRAMECC, + FeatureMFMAInlineLiteralBug, FeatureCodeObjectV3]>; def FeatureISAVersion9_0_9 : FeatureSet< diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 419ebb2240ad..e72b3f4fde63 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -173,6 +173,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + // TODO: Does not require queue ptr on gfx9+ case Intrinsic::trap: case Intrinsic::debugtrap: IsQueuePtr = true; @@ -194,18 +197,12 @@ static bool handleAttr(Function &Parent, const Function &Callee, static void copyFeaturesToFunction(Function &Parent, const Function &Callee, bool &NeedQueuePtr) { // X ids unnecessarily propagated to kernels. 
- static const StringRef AttrNames[] = { - { "amdgpu-work-item-id-x" }, - { "amdgpu-work-item-id-y" }, - { "amdgpu-work-item-id-z" }, - { "amdgpu-work-group-id-x" }, - { "amdgpu-work-group-id-y" }, - { "amdgpu-work-group-id-z" }, - { "amdgpu-dispatch-ptr" }, - { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" }, - { "amdgpu-implicitarg-ptr" } - }; + static constexpr StringLiteral AttrNames[] = { + "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", + "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", + "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", + "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", + "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 097730441ed8..f0e7ee910f95 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -48,8 +48,8 @@ public: return ArgDescriptor(Reg, Mask, false, true); } - static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) { - return ArgDescriptor(Reg, Mask, true, true); + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { + return ArgDescriptor(Offset, Mask, true, true); } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 743ac64b8f10..f2d903c8e7b1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { // alignment. Streamer.EmitValueToAlignment(64, 0, 1, 0); if (ReadOnlySection.getAlignment() < 64) - ReadOnlySection.setAlignment(64); + ReadOnlySection.setAlignment(Align(64)); const MCSubtargetInfo &STI = MF->getSubtarget(); @@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } -void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Print comments that apply to both callable functions and entry points. void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + if (NumAGPR) { + OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); + OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), + false); + } OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), false); @@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // The starting address of all shader programs must be 256 bytes aligned. // Regular functions just need the basic required instruction alignment. 
- MF.setAlignment(MFI->isEntryFunction() ? 8 : 2); + MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); SetupMachineFunction(MF); @@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, + STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(), + Info.getTotalNumVGPRs(STM), Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); @@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR, + STM.hasMAIInsts() + ? CurrentProgramInfo.NumAccVGPR + : Optional<uint32_t>(), + CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); @@ -507,6 +520,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); OutStreamer->emitRawComment( + " Occupancy: " + + Twine(CurrentProgramInfo.Occupancy), false); + + OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( @@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( UsesVCC, UsesFlatScratch); } +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( + const GCNSubtarget &ST) const { + return std::max(NumVGPR, NumAGPR); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( HighestVGPRReg = Reg; break; } - MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg); - if (MRI.isPhysRegUsed(AReg)) { - HighestVGPRReg = AReg; - break; + } + + if (ST.hasMAIInsts()) { + MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestAGPRReg = Reg; + break; + } } + Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 
0 : + TRI.getHWRegIndex(HighestAGPRReg) + 1; } MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; @@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } int32_t MaxVGPR = -1; + int32_t MaxAGPR = -1; int32_t MaxSGPR = -1; uint64_t CalleeFrameSize = 0; @@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( for (const MachineOperand &MO : MI.operands()) { unsigned Width = 0; bool IsSGPR = false; + bool IsAGPR = false; if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); switch (Reg) { case AMDGPU::EXEC: case AMDGPU::EXEC_LO: @@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 1; } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 1; } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && @@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 2; } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 2; } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { IsSGPR = false; @@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 4; } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && @@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 16; } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 16; } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { IsSGPR = true; @@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Width = 32; } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { IsSGPR = false; + IsAGPR = true; Width = 32; } else { llvm_unreachable("Unknown register class"); @@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( int MaxUsed = HWReg + Width - 1; if (IsSGPR) { MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else if (IsAGPR) { + MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; } else { MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; } @@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( 47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); + MaxAGPR = std::max(MaxAGPR, 23); CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); Info.UsesVCC = true; @@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR); CalleeFrameSize = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); Info.UsesVCC |= I->second.UsesVCC; @@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; + Info.NumAGPR = MaxAGPR + 1; Info.PrivateSegmentSize += CalleeFrameSize; return Info; @@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) { SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumArchVGPR = Info.NumVGPR; + ProgInfo.NumAccVGPR = Info.NumAGPR; + ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MF.getFunction().getContext().diagnose(DiagStackSize); } - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are @@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + + ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU); } static unsigned getRsrcReg(CallingConv::ID CallConv) { @@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; - // These alignment values are specified in powers of two, so alignment = - // 2^n. The minimum alignment is 2^4 = 16. - Out.kernarg_segment_alignment = std::max<size_t>(4, - countTrailingZeros(MaxKernArgAlign)); + // kernarg_segment_alignment is specified as log of the alignment. + // The minimum alignment is 16. 
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index cf77034329ef..c50c19a4609c 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -43,6 +43,7 @@ private: // Track the number of explicitly used VGPRs. Special registers reserved at // the end are tracked separately. int32_t NumVGPR = 0; + int32_t NumAGPR = 0; int32_t NumExplicitSGPR = 0; uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; @@ -51,6 +52,7 @@ private: bool HasRecursion = false; int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; + int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -77,6 +79,8 @@ private: void EmitPALMetadata(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, + Optional<uint32_t> NumAGPR, + uint32_t TotalNumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, @@ -125,7 +129,7 @@ public: void EmitFunctionEntryLabel() override; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitGlobalVariable(const GlobalVariable *GV) override; @@ -140,8 +144,8 @@ public: const char *ExtraCode, raw_ostream &O) override; protected: - mutable std::vector<std::string> DisasmLines, HexLines; - mutable size_t DisasmLineMaxLen; + std::vector<std::string> DisasmLines, HexLines; + size_t DisasmLineMaxLen; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 8a92e7d923fb..ba8343142c63 100644 --- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -24,20 +25,10 @@ #define DEBUG_TYPE "amdgpu-atomic-optimizer" using namespace llvm; +using namespace llvm::AMDGPU; namespace { -enum DPP_CTRL { - DPP_ROW_SR1 = 0x111, - DPP_ROW_SR2 = 0x112, - DPP_ROW_SR3 = 0x113, - DPP_ROW_SR4 = 0x114, - DPP_ROW_SR8 = 0x118, - DPP_WF_SR1 = 0x138, - DPP_ROW_BCAST15 = 0x142, - DPP_ROW_BCAST31 = 0x143 -}; - struct ReplacementInfo { Instruction *I; AtomicRMWInst::BinOp Op; @@ -52,9 +43,12 @@ private: const LegacyDivergenceAnalysis *DA; const DataLayout *DL; DominatorTree *DT; - bool HasDPP; + const GCNSubtarget *ST; bool IsPixelShader; + Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const; + Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { DT = DTW ? 
&DTW->getDomTree() : nullptr; const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - HasDPP = ST.hasDPP(); + ST = &TM.getSubtarget<GCNSubtarget>(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; visit(F); @@ -142,17 +135,18 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. - if (DA->isDivergent(I.getOperand(PtrIdx))) { + if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) { return; } - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } @@ -219,20 +213,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { const unsigned ValIdx = 0; - const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (DA->isDivergent(I.getOperand(Idx))) { + if (DA->isDivergentUse(&I.getOperandUse(Idx))) { return; } } @@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, return B.CreateSelect(Cond, LHS, RHS); } +// Use the builder to create an inclusive scan of V across the wavefront, with +// all lanes active. +Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *V, Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *PermLaneX16 = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + + for (unsigned Idx = 0; Idx < 4; Idx++) { + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + } + if (ST->hasDPPBroadcasts()) { + // GFX9 has DPP row broadcast operations. 
+ V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()})); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), + B.getInt32(0xf), B.getFalse()})); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + + // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes + // 48..63). + Value *const PermX = + B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1), + B.getFalse(), B.getFalse()}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + if (!ST->isWave32()) { + // Combine lane 31 into lanes 32..63. + Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + } + } + return V; +} + +// Use the builder to create a shift right of V across the wavefront, with all +// lanes active, to turn an inclusive scan into an exclusive scan. +Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, + Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + + if (ST->hasDPPWavefrontShifts()) { + // GFX9 has DPP wavefront shift operations. + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), + B.getInt32(0xf), B.getFalse()}); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + Value *Old = V; + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + + // Copy the old lane 15 to the new lane 16. + V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), + B.getInt32(16), V}); + + if (!ST->isWave32()) { + // Copy the old lane 31 to the new lane 32. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); + + // Copy the old lane 47 to the new lane 48. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + } + } + + return V; +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. + Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); CallInst *const Ballot = B.CreateIntrinsic( - Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()}, {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. 
If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. - Value *const BitCast = B.CreateBitCast(Ballot, VecTy); - Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); - Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); - CallInst *const PartialMbcnt = B.CreateIntrinsic( - Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - Value *const Mbcnt = - B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}), - Ty, false); + Value *Mbcnt; + if (ST->isWave32()) { + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Ballot, B.getInt32(0)}); + } else { + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {ExtractLo, B.getInt32(0)}); + Mbcnt = + B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); + } + Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); @@ -373,47 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, if (ValDivergent) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - CallInst *const SetInactive = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - - CallInst *const FirstDPP = - B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, - {Identity, SetInactive, B.getInt32(DPP_WF_SR1), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - ExclScan = FirstDPP; - - const unsigned Iters = 7; - const unsigned DPPCtrl[Iters] = { - DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4, - DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; - - // This loop performs an exclusive scan across the wavefront, with all lanes - // active (by using the WWM intrinsic). - for (unsigned Idx = 0; Idx < Iters; Idx++) { - Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan; - CallInst *const DPP = B.CreateIntrinsic( - Intrinsic::amdgcn_update_dpp, Ty, - {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); - - ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); - } + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + NewV = buildScan(B, ScanOp, NewV, Identity); + ExclScan = buildShiftRight(B, NewV, Identity); // Read the value from the last lane, which has accumlated the values of // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. 
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); + B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = @@ -421,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, B.getInt32(63)}); + {NewV, LastLaneIdx}); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -493,77 +577,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // original instruction. B.SetInsertPoint(&I); - // Create a PHI node to get our new atomic result into the exit block. - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), EntryBB); - PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); - - // We need to broadcast the value who was the lowest active lane (the first - // lane) to all other lanes in the wavefront. We use an intrinsic for this, - // but have to handle 64-bit broadcasts with two calls to this intrinsic. - Value *BroadcastI = nullptr; - - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); - CallInst *const ReadFirstLaneLo = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); - CallInst *const ReadFirstLaneHi = - B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); - BroadcastI = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + const bool NeedResult = !I.use_empty(); + if (NeedResult) { + // Create a PHI node to get our new atomic result into the exit block. + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), EntryBB); + PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); - // Now that we have the result of our single atomic operation, we need to - // get our individual lane's slice into the result. We use the lane offset we - // previously calculated combined with the atomic result value we got from the - // first lane, to get our lane's index into the atomic result. 
- Value *LaneOffset = nullptr; - if (ValDivergent) { - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); - } else { - switch (Op) { - default: - llvm_unreachable("Unhandled atomic op"); - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - LaneOffset = B.CreateMul(V, Mbcnt); - break; - case AtomicRMWInst::And: - case AtomicRMWInst::Or: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - LaneOffset = B.CreateSelect(Cond, Identity, V); - break; - case AtomicRMWInst::Xor: - LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); - break; + // We need to broadcast the value who was the lowest active lane (the first + // lane) to all other lanes in the wavefront. We use an intrinsic for this, + // but have to handle 64-bit broadcasts with two calls to this intrinsic. + Value *BroadcastI = nullptr; + + if (TyBitWidth == 64) { + Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); + Value *const ExtractHi = + B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); + CallInst *const ReadFirstLaneLo = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); + CallInst *const ReadFirstLaneHi = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); + Value *const PartialInsert = B.CreateInsertElement( + UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); + Value *const Insert = + B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); + BroadcastI = B.CreateBitCast(Insert, Ty); + } else if (TyBitWidth == 32) { + + BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); + } else { + llvm_unreachable("Unhandled atomic bit width"); } - } - Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); - if (IsPixelShader) { - // Need a final PHI to reconverge to above the helper lane branch mask. - B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + // Now that we have the result of our single atomic operation, we need to + // get our individual lane's slice into the result. We use the lane offset + // we previously calculated combined with the atomic result value we got + // from the first lane, to get our lane's index into the atomic result. + Value *LaneOffset = nullptr; + if (ValDivergent) { + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + } else { + switch (Op) { + default: + llvm_unreachable("Unhandled atomic op"); + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + LaneOffset = B.CreateMul(V, Mbcnt); + break; + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + LaneOffset = B.CreateSelect(Cond, Identity, V); + break; + case AtomicRMWInst::Xor: + LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1)); + break; + } + } + Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); - PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); - PHI->addIncoming(Result, I.getParent()); - I.replaceAllUsesWith(PHI); - } else { - // Replace the original atomic instruction with the new one. - I.replaceAllUsesWith(Result); + if (IsPixelShader) { + // Need a final PHI to reconverge to above the helper lane branch mask. 
+ B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); + PHI->addIncoming(Result, I.getParent()); + I.replaceAllUsesWith(PHI); + } else { + // Replace the original atomic instruction with the new one. + I.replaceAllUsesWith(Result); + } } // And delete the original. diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index b107c357196d..58c44acde1a7 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -30,13 +30,15 @@ using namespace llvm; namespace { -struct OutgoingArgHandler : public CallLowering::ValueHandler { - OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} +struct OutgoingValueHandler : public CallLowering::ValueHandler { + OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn), MIB(MIB) {} MachineInstrBuilder MIB; + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { llvm_unreachable("not implemented"); @@ -49,15 +51,96 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { void assignValueToReg(Register ValVReg, Register PhysReg, CCValAssign &VA) override { - MIB.addUse(PhysReg); - MIRBuilder.buildCopy(PhysReg, ValVReg); + Register ExtReg; + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to + // extend and do a 32-bit copy to avoid the verifier complaining about it. + ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0); + } else + ExtReg = extendRegister(ValVReg, VA); + + MIRBuilder.buildCopy(PhysReg, ExtReg); + MIB.addUse(PhysReg, RegState::Implicit); } bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, const CallLowering::ArgInfo &Info, + ISD::ArgFlagsTy Flags, CCState &State) override { - return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); + } +}; + +struct IncomingArgHandler : public CallLowering::ValueHandler { + uint64_t StackUsed = 0; + + IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(B, MRI, AssignFn) {} + + Register getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + Register AddrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32)); + MIRBuilder.buildFrameIndex(AddrReg, FI); + StackUsed = std::max(StackUsed, Size + Offset); + return AddrReg; + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign &VA) override { + markPhysRegUsed(PhysReg); + + if (VA.getLocVT().getSizeInBits() < 32) { + // 16-bit types are reported as legal for 32-bit registers. We need to do + // a 32-bit copy, and truncate to avoid the verifier complaining about it. 
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + return; + } + + switch (VA.getLocInfo()) { + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + } + } + + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + // FIXME: Get alignment + auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). + virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + // FIXME: What is the point of this being a callback? + bool isIncomingArgumentHandler() const override { return true; } +}; + +struct FormalArgHandler : public IncomingArgHandler { + FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(B, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -67,55 +150,198 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } -bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, + SplitArgTy PerformArgSplit) const { + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + LLVMContext &Ctx = OrigArg.Ty->getContext(); + + if (OrigArg.Ty->isVoidTy()) + return; + + SmallVector<EVT, 4> SplitVTs; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs); + + assert(OrigArg.Regs.size() == SplitVTs.size()); + + int SplitIdx = 0; + for (EVT VT : SplitVTs) { + unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + Type *Ty = VT.getTypeForEVT(Ctx); + + + + if (NumParts == 1) { + // No splitting to do, but we want to replace the original type (e.g. [1 x + // double] -> double). + SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty, + OrigArg.Flags, OrigArg.IsFixed); + + ++SplitIdx; + continue; + } + + LLT LLTy = getLLTForType(*Ty, DL); + + SmallVector<Register, 8> SplitRegs; + + EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); + Type *PartTy = PartVT.getTypeForEVT(Ctx); + LLT PartLLT = getLLTForType(*PartTy, DL); + + // FIXME: Should we be reporting all of the part registers for a single + // argument, and let handleAssignments take care of the repacking? + for (unsigned i = 0; i < NumParts; ++i) { + Register PartReg = MRI.createGenericVirtualRegister(PartLLT); + SplitRegs.push_back(PartReg); + SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags); + } + + PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx); + + ++SplitIdx; + } +} + +// Get the appropriate type to make \p OrigTy \p Factor times bigger. 
+static LLT getMultipleType(LLT OrigTy, int Factor) { + if (OrigTy.isVector()) { + return LLT::vector(OrigTy.getNumElements() * Factor, + OrigTy.getElementType()); + } + + return LLT::scalar(OrigTy.getSizeInBits() * Factor); +} + +// TODO: Move to generic code +static void unpackRegsToOrigType(MachineIRBuilder &B, + ArrayRef<Register> DstRegs, + Register SrcReg, + LLT SrcTy, + LLT PartTy) { + assert(DstRegs.size() > 1 && "Nothing to unpack"); + + MachineFunction &MF = B.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned PartSize = PartTy.getSizeInBits(); + + if (SrcTy.isVector() && !PartTy.isVector() && + PartSize > SrcTy.getElementType().getSizeInBits()) { + // Vector was scalarized, and the elements extended. + auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), + SrcReg); + for (int i = 0, e = DstRegs.size(); i != e; ++i) + B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i)); + return; + } + + if (SrcSize % PartSize == 0) { + B.buildUnmerge(DstRegs, SrcReg); + return; + } + + const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize; + + LLT BigTy = getMultipleType(PartTy, NumRoundedParts); + auto ImpDef = B.buildUndef(BigTy); + + Register BigReg = MRI.createGenericVirtualRegister(BigTy); + B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0); + + int64_t Offset = 0; + for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize) + B.buildExtract(DstRegs[i], BigReg, Offset); +} + +/// Lower the return value for the already existing \p Ret. This assumes that +/// \p B's insertion point is correct. +bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, + const Value *Val, ArrayRef<Register> VRegs, + MachineInstrBuilder &Ret) const { + if (!Val) + return true; + + auto &MF = B.getMF(); + const auto &F = MF.getFunction(); + const DataLayout &DL = MF.getDataLayout(); + + CallingConv::ID CC = F.getCallingConv(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + ArgInfo OrigRetInfo(VRegs, Val->getType()); + setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F); + SmallVector<ArgInfo, 4> SplitRetInfos; + + splitToValueTypes( + OrigRetInfo, SplitRetInfos, DL, MRI, CC, + [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT); + }); + + CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); + + OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn); + return handleAssignments(B, SplitRetInfos, RetHandler); +} + +bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef<Register> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MFI->setIfReturnsVoid(!Val); - if (!Val) { - MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + assert(!Val == VRegs.empty() && "Return value without a vreg"); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const bool IsShader = AMDGPU::isShader(CC); + const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) || + AMDGPU::isKernel(CC); + if (IsWaveEnd) { + B.buildInstr(AMDGPU::S_ENDPGM) + .addImm(0); return true; } - Register VReg = VRegs[0]; - - const Function &F = MF.getFunction(); - auto &DL = F.getParent()->getDataLayout(); - if (!AMDGPU::isShader(F.getCallingConv())) - return false; + auto 
const &ST = B.getMF().getSubtarget<GCNSubtarget>(); + unsigned ReturnOpc = + IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; - const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); - SmallVector<EVT, 4> SplitVTs; - SmallVector<uint64_t, 4> Offsets; - ArgInfo OrigArg{VReg, Val->getType()}; - setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); - ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - - SmallVector<ArgInfo, 8> SplitArgs; - CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false); - for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { - Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext()); - SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + auto Ret = B.buildInstrNoInsert(ReturnOpc); + Register ReturnAddrVReg; + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass); + Ret.addUse(ReturnAddrVReg); } - auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG); - OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn); - if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + + if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; - MIRBuilder.insertInstr(RetInstr); + if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF), + &AMDGPU::SGPR_64RegClass); + B.buildCopy(ReturnAddrVReg, LiveInReturn); + } + + // TODO: Handle CalleeSavedRegsViaCopy. + + B.insertInstr(Ret); return true; } -Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, +Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); @@ -128,79 +354,37 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - MIRBuilder.buildConstant(OffsetReg, Offset); + B.buildConstant(OffsetReg, Offset); - MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); return DstReg; } -void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, unsigned Align, Register DstReg) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | - MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, TypeSize, Align); - MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); -} - -static Register findFirstFreeSGPR(CCState &CCInfo) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - 
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { - if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { - return AMDGPU::SGPR0 + Reg; - } - } - llvm_unreachable("Cannot allocate sgpr"); -} - -static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkItemIDX()) { - Register Reg = AMDGPU::VGPR0; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); - } - - if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); - } + B.buildLoad(DstReg, PtrReg, *MMO); } // Allocate special inputs passed in user SGPRs. static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { @@ -229,8 +413,8 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); Register VReg = MRI.createGenericVirtualRegister(P4); MRI.addLiveIn(InputPtrReg, VReg); - MIRBuilder.getMBB().addLiveIn(InputPtrReg); - MIRBuilder.buildCopy(VReg, InputPtrReg); + B.getMBB().addLiveIn(InputPtrReg); + B.buildCopy(VReg, InputPtrReg); CCInfo.AllocateReg(InputPtrReg); } @@ -250,74 +434,22 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, // these from the dispatch pointer. } -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { - const LLT S32 = LLT::scalar(32); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - unsigned Reg = Info.addWorkGroupIDZ(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupInfo()) { - unsigned Reg = Info.addWorkGroupInfo(); - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32); - CCInfo.AllocateReg(Reg); - } - - if (Info.hasPrivateSegmentWaveByteOffset()) { - // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg; - - if (IsShader) { - PrivateSegmentWaveByteOffsetReg = - Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); - - // This is true if the scratch wave byte offset doesn't have a fixed - // location. 
- if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { - PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); - } - } else - PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); - - MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); - CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); - } -} - bool AMDGPUCallLowering::lowerFormalArgumentsKernel( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + const DataLayout &DL = F.getParent()->getDataLayout(); SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); - allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info); + allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info); unsigned i = 0; const unsigned KernArgBaseAlign = 16; @@ -343,123 +475,242 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg); + lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg); if (OrigArgRegs.size() > 1) - unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder); + unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); ++i; } - allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; } +// TODO: Move this to generic code +static void packSplitRegsToOrigType(MachineIRBuilder &B, + ArrayRef<Register> OrigRegs, + ArrayRef<Register> Regs, + LLT LLTy, + LLT PartLLT) { + if (!LLTy.isVector() && !PartLLT.isVector()) { + B.buildMerge(OrigRegs[0], Regs); + return; + } + + if (LLTy.isVector() && PartLLT.isVector()) { + assert(LLTy.getElementType() == PartLLT.getElementType()); + + int DstElts = LLTy.getNumElements(); + int PartElts = PartLLT.getNumElements(); + if (DstElts % PartElts == 0) + B.buildConcatVectors(OrigRegs[0], Regs); + else { + // Deal with v3s16 split into v2s16 + assert(PartElts == 2 && DstElts % 2 != 0); + int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts); + + LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType()); + auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs); + B.buildExtract(OrigRegs[0], RoundedConcat, 0); + } + + return; + } + + assert(LLTy.isVector() && !PartLLT.isVector()); + + LLT DstEltTy = LLTy.getElementType(); + if (DstEltTy == PartLLT) { + // Vector was trivially scalarized. + B.buildBuildVector(OrigRegs[0], Regs); + } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) { + // Deal with vector with 64-bit elements decomposed to 32-bit + // registers. Need to create intermediate 64-bit elements. 
+ SmallVector<Register, 8> EltMerges; + int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits(); + + assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0); + + for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) { + auto Merge = B.buildMerge(DstEltTy, + Regs.take_front(PartsPerElt)); + EltMerges.push_back(Merge.getReg(0)); + Regs = Regs.drop_front(PartsPerElt); + } + + B.buildBuildVector(OrigRegs[0], EltMerges); + } else { + // Vector was split, and elements promoted to a wider type. + LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT); + auto BV = B.buildBuildVector(BVType, Regs); + B.buildTrunc(OrigRegs[0], BV); + } +} + bool AMDGPUCallLowering::lowerFormalArguments( - MachineIRBuilder &MIRBuilder, const Function &F, + MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const { + CallingConv::ID CC = F.getCallingConv(); + // The infrastructure for normal calling convention lowering is essentially // useless for kernels. We want to avoid any kind of legalization or argument // splitting. - if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) - return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs); + if (CC == CallingConv::AMDGPU_KERNEL) + return lowerFormalArgumentsKernel(B, F, VRegs); - // AMDGPU_GS and AMDGP_HS are not supported yet. - if (F.getCallingConv() == CallingConv::AMDGPU_GS || - F.getCallingConv() == CallingConv::AMDGPU_HS) - return false; + const bool IsShader = AMDGPU::isShader(CC); + const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC); - MachineFunction &MF = MIRBuilder.getMF(); + MachineFunction &MF = B.getMF(); + MachineBasicBlock &MBB = B.getMBB(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - bool IsShader = AMDGPU::isShader(F.getCallingConv()); SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); + + if (!IsEntryFunc) { + Register ReturnAddrReg = TRI->getReturnAddressReg(MF); + Register LiveInReturn = MF.addLiveIn(ReturnAddrReg, + &AMDGPU::SGPR_64RegClass); + MBB.addLiveIn(ReturnAddrReg); + B.buildCopy(LiveInReturn, ReturnAddrReg); + } if (Info->hasImplicitBufferPtr()) { - unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); + Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } - unsigned NumArgs = F.arg_size(); - Function::const_arg_iterator CurOrigArg = F.arg_begin(); - const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + + SmallVector<ArgInfo, 32> SplitArgs; + unsigned Idx = 0; unsigned PSInputNum = 0; - BitVector Skipped(NumArgs); - for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { - EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); - - // We can only hanlde simple value types at the moment. 
- ISD::ArgFlagsTy Flags; - assert(VRegs[i].size() == 1 && "Can't lower into more than one register"); - ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()}; - setArgFlags(OrigArg, i + 1, DL, F); - Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); - - if (F.getCallingConv() == CallingConv::AMDGPU_PS && - !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && - PSInputNum <= 15) { - if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { - Skipped.set(i); - ++PSInputNum; + + for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()) == 0) + continue; + + const bool InReg = Arg.hasAttribute(Attribute::InReg); + + // SGPR arguments to functions not implemented. + if (!IsShader && InReg) + return false; + + if (Arg.hasAttribute(Attribute::SwiftSelf) || + Arg.hasAttribute(Attribute::SwiftError) || + Arg.hasAttribute(Attribute::Nest)) + return false; + + if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) { + const bool ArgUsed = !Arg.use_empty(); + bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum); + + if (!SkipArg) { + Info->markPSInputAllocated(PSInputNum); + if (ArgUsed) + Info->markPSInputEnabled(PSInputNum); + } + + ++PSInputNum; + + if (SkipArg) { + for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) + B.buildUndef(VRegs[Idx][I]); + + ++Idx; continue; } + } - Info->markPSInputAllocated(PSInputNum); - if (!CurOrigArg->use_empty()) - Info->markPSInputEnabled(PSInputNum); + ArgInfo OrigArg(VRegs[Idx], Arg.getType()); + setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); - ++PSInputNum; + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, CC, + // FIXME: We should probably be passing multiple registers to + // handleAssignments to do this + [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, + LLTy, PartLLT); + }); + + ++Idx; + } + + // At least one interpolation mode must be enabled or else the GPU will + // hang. + // + // Check PSInputAddr instead of PSInputEnable. The idea is that if the user + // set PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. + if (CC == CallingConv::AMDGPU_PS) { + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); } - CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), - /*IsVarArg=*/false); - - if (ValEVT.isVector()) { - EVT ElemVT = ValEVT.getVectorElementType(); - if (!ValEVT.isSimple()) - return false; - MVT ValVT = ElemVT.getSimpleVT(); - bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, - OrigArg.Flags, CCInfo); - if (!Res) - return false; - } else { - MVT ValVT = ValEVT.getSimpleVT(); - if (!ValEVT.isSimple()) - return false; - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); - - // Fail if we don't know how to handle this type. 
- if (Res) - return false; + if (Subtarget.isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the frontend + // set up an input arg for a particular interpolation mode, but nothing + // uses that input arg. Really we should have an earlier pass that removes + // such an arg.) + unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && + (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } } - Function::const_arg_iterator Arg = F.arg_begin(); - - if (F.getCallingConv() == CallingConv::AMDGPU_VS || - F.getCallingConv() == CallingConv::AMDGPU_PS) { - for (unsigned i = 0, OrigArgIdx = 0; - OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { - if (Skipped.test(OrigArgIdx)) - continue; - assert(VRegs[OrigArgIdx].size() == 1 && - "Can't lower into more than 1 reg"); - CCValAssign &VA = ArgLocs[i++]; - MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]); - MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg()); - } + const SITargetLowering &TLI = *getTLI<SITargetLowering>(); + CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg()); - allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader); - return true; + if (!MBB.empty()) + B.setInstr(*MBB.begin()); + + FormalArgHandler Handler(B, MRI, AssignFn); + if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler)) + return false; + + if (!IsEntryFunc) { + // Special inputs come after user arguments. + TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + + // Start adding system SGPRs. + if (IsEntryFunc) { + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); + } else { + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); + CCInfo.AllocateReg(Info->getFrameOffsetReg()); + TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } - return false; + // Move back to the end of the basic block. + B.setMBB(MBB); + + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 3599659cac6a..53a562586bc0 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -20,26 +20,37 @@ namespace llvm { class AMDGPUTargetLowering; +class MachineInstrBuilder; class AMDGPUCallLowering: public CallLowering { - Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const; - void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, - uint64_t Offset, unsigned Align, - Register DstReg) const; + void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, + unsigned Align, Register DstReg) const; - public: + /// A function of this type is used to perform value split action. 
+ using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv, + SplitArgTy SplitArg) const; + + bool lowerReturnVal(MachineIRBuilder &B, const Value *Val, + ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const; + +public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); - bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + bool lowerReturn(MachineIRBuilder &B, const Value *Val, ArrayRef<Register> VRegs) const override; - bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder, - const Function &F, + bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const; - bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + bool lowerFormalArguments(MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 3688cd77542e..f8a54a61aac2 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -24,22 +24,9 @@ def CC_SI : CallingConv<[ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, - SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, - SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, - SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, - SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, - SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, - SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, - SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, - SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, - SGPR104, SGPR105 + SGPR40, SGPR41, SGPR42, SGPR43 ]>>>, - // We have no way of referring to the generated register tuples - // here, so use a custom function. - CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, - CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, - // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, @@ -69,15 +56,7 @@ def RetCC_SI_Shader : CallingConv<[ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, - SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, - SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, - SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, - SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, - SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, - SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, - SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, - SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, - SGPR104, SGPR105 + SGPR40, SGPR41, SGPR42, SGPR43 ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
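Aside on the interpolation workaround in the AMDGPUCallLowering.cpp hunk above: the bit tests on PSInputAddr can be read as a small standalone predicate. This is only a sketch; the helper name is invented here, and the constants simply restate the comment (PERSP_* in bits 0-3, LINEAR_* in bits 4-6, POS_W_FLOAT at input 11).

static bool needsForcedPerspInput(unsigned PSInputAddr, bool PosWFloatAllocated) {
  // Neither a PERSP_* (0xF) nor a LINEAR_* (0x70) input is enabled; without
  // the workaround the GPU would hang.
  if ((PSInputAddr & 0x7F) == 0)
    return true;
  // POS_W_FLOAT (input 11) is requested but no PERSP_* input is enabled.
  if ((PSInputAddr & 0xF) == 0 && PosWFloatAllocated)
    return true;
  return false;
}

When this predicate holds, the lowering above reserves VGPR0/VGPR1 and force-enables PS input 0.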
@@ -138,7 +117,6 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, @@ -157,7 +135,6 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b750c6b5f6d2..1640a4a59ee2 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -55,6 +55,12 @@ static cl::opt<bool> WidenLoads( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> UseMul24Intrin( + "amdgpu-codegenprepare-mul24", + cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { const GCNSubtarget *ST = nullptr; @@ -509,7 +515,9 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { } } - I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals)); + Value *NewVal = insertValues(Builder, Ty, ResultVals); + NewVal->takeName(&I); + I.replaceAllUsesWith(NewVal); I.eraseFromParent(); return true; @@ -879,7 +887,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { DA->isUniform(&I) && promoteUniformOpToI32(I)) return true; - if (replaceMulWithMul24(I)) + if (UseMul24Intrin && replaceMulWithMul24(I)) return true; bool Changed = false; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index e80797736363..61ce83b30e00 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -13,9 +13,9 @@ #include "AMDGPUFrameLowering.h" using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { } +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, Align StackAl, + int LAO, Align TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) {} AMDGPUFrameLowering::~AMDGPUFrameLowering() = default; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 48b64488303e..92e256cf2829 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -25,8 +25,8 @@ namespace llvm { /// See TargetFrameInfo for more comments. 
class AMDGPUFrameLowering : public TargetFrameLowering { public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); + AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()); ~AMDGPUFrameLowering() override; /// \returns The number of 32-bit sub-registers that are used when storing diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td index cad4c2ef404c..f2be1ca44d34 100644 --- a/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -12,10 +12,6 @@ include "AMDGPU.td" -def p0 : PtrValueType<i64, 0>; -def p1 : PtrValueType<i64, 1>; -def p4 : PtrValueType<i64, 4>; - def sd_vsrc0 : ComplexPattern<i32, 1, "">; def gi_vsrc0 : GIComplexOperandMatcher<s32, "selectVSRC0">, @@ -38,6 +34,18 @@ def gi_vop3omods : GIComplexOperandMatcher<s32, "selectVOP3OMods">, GIComplexPatternEquiv<VOP3OMods>; +def gi_vop3omods0clamp0omod : + GIComplexOperandMatcher<s32, "selectVOP3Mods0Clamp0OMod">, + GIComplexPatternEquiv<VOP3Mods0Clamp0OMod>; + +def gi_vop3opselmods0 : + GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">, + GIComplexPatternEquiv<VOP3OpSelMods0>; + +def gi_vop3opselmods : + GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, + GIComplexPatternEquiv<VOP3OpSelMods>; + def gi_smrd_imm : GIComplexOperandMatcher<s64, "selectSmrdImm">, GIComplexPatternEquiv<SMRDImm>; @@ -50,12 +58,19 @@ def gi_smrd_sgpr : GIComplexOperandMatcher<s64, "selectSmrdSgpr">, GIComplexPatternEquiv<SMRDSgpr>; +// FIXME: Why are the atomic versions separated? def gi_flat_offset : GIComplexOperandMatcher<s64, "selectFlatOffset">, GIComplexPatternEquiv<FLATOffset>; def gi_flat_offset_signed : GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, GIComplexPatternEquiv<FLATOffsetSigned>; +def gi_flat_atomic : + GIComplexOperandMatcher<s64, "selectFlatOffset">, + GIComplexPatternEquiv<FLATAtomic>; +def gi_flat_signed_atomic : + GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">, + GIComplexPatternEquiv<FLATSignedAtomic>; def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, @@ -64,6 +79,44 @@ def gi_mubuf_scratch_offen : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">, GIComplexPatternEquiv<MUBUFScratchOffen>; +def gi_ds_1addr_1offset : + GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">, + GIComplexPatternEquiv<DS1Addr1Offset>; + + +// Separate load nodes are defined to glue m0 initialization in +// SelectionDAG. The GISel selector can just insert m0 initialization +// directly before before selecting a glue-less load, so hide this +// distinction. 
+ +def : GINodeEquiv<G_LOAD, AMDGPUld_glue> { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv<G_STORE, AMDGPUst_glue> { + let CheckMMOIsNonAtomic = 1; +} + +def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> { + bit CheckMMOIsAtomic = 1; +} + + + +def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>; +def : GINodeEquiv<G_ATOMICRMW_XCHG, atomic_swap_glue>; +def : GINodeEquiv<G_ATOMICRMW_ADD, atomic_load_add_glue>; +def : GINodeEquiv<G_ATOMICRMW_SUB, atomic_load_sub_glue>; +def : GINodeEquiv<G_ATOMICRMW_AND, atomic_load_and_glue>; +def : GINodeEquiv<G_ATOMICRMW_OR, atomic_load_or_glue>; +def : GINodeEquiv<G_ATOMICRMW_XOR, atomic_load_xor_glue>; +def : GINodeEquiv<G_ATOMICRMW_MIN, atomic_load_min_glue>; +def : GINodeEquiv<G_ATOMICRMW_MAX, atomic_load_max_glue>; +def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>; +def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>; +def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>; + +def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>; class GISelSop2Pat < SDPatternOperator node, @@ -143,20 +196,6 @@ multiclass GISelVop2IntrPat < def : GISelSop2Pat <or, S_OR_B32, i32>; def : GISelVop2Pat <or, V_OR_B32_e32, i32>; -// FIXME: We can't re-use SelectionDAG patterns here because they match -// against a custom SDNode and we would need to create a generic machine -// instruction that is equivalent to the custom SDNode. This would also require -// us to custom legalize the intrinsic to the new generic machine instruction, -// but I can't get custom legalizing of intrinsic to work and I'm not sure if -// this is even supported yet. -def : GISelVop3Pat2ModsPat < - int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>; - -defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; -def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; -defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>; -def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; - // Since GlobalISel is more flexible then SelectionDAG, I think we can get // away with adding patterns for integer types and not legalizing all // loads and stores to vector types. 
This should help simplify the load/store @@ -164,3 +203,6 @@ def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; foreach Ty = [i64, p0, p1, p4] in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } + +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, + GISDNodeXFormEquiv<as_i32timm>; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 0a1f48231b18..85d1ad349157 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -22,15 +22,17 @@ enum PartialMappingIdx { PM_SGPR128 = 9, PM_SGPR256 = 10, PM_SGPR512 = 11, - PM_VGPR1 = 12, - PM_VGPR16 = 16, - PM_VGPR32 = 17, - PM_VGPR64 = 18, - PM_VGPR128 = 19, - PM_VGPR256 = 20, - PM_VGPR512 = 21, - PM_SGPR96 = 22, - PM_VGPR96 = 23 + PM_SGPR1024 = 12, + PM_VGPR1 = 13, + PM_VGPR16 = 17, + PM_VGPR32 = 18, + PM_VGPR64 = 19, + PM_VGPR128 = 20, + PM_VGPR256 = 21, + PM_VGPR512 = 22, + PM_VGPR1024 = 23, + PM_SGPR96 = 24, + PM_VGPR96 = 25 }; const RegisterBankInfo::PartialMapping PartMappings[] { @@ -45,6 +47,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, SGPRRegBank}, {0, 256, SGPRRegBank}, {0, 512, SGPRRegBank}, + {0, 1024, SGPRRegBank}, {0, 1, VGPRRegBank}, // VGPR begin {0, 16, VGPRRegBank}, @@ -53,8 +56,9 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 128, VGPRRegBank}, {0, 256, VGPRRegBank}, {0, 512, VGPRRegBank}, + {0, 1024, VGPRRegBank}, {0, 96, SGPRRegBank}, - {0, 96, VGPRRegBank}, + {0, 96, VGPRRegBank} }; const RegisterBankInfo::ValueMapping ValMappings[] { @@ -65,41 +69,43 @@ const RegisterBankInfo::ValueMapping ValMappings[] { {&PartMappings[1], 1}, // SGPRs - {&PartMappings[2], 1}, + {&PartMappings[2], 1}, // 1 {nullptr, 0}, // Illegal power of 2 sizes {nullptr, 0}, {nullptr, 0}, - {&PartMappings[3], 1}, - {&PartMappings[4], 1}, - {&PartMappings[5], 1}, - {&PartMappings[6], 1}, - {&PartMappings[7], 1}, - {&PartMappings[8], 1}, - - // VGPRs - {&PartMappings[9], 1}, + {&PartMappings[3], 1}, // 16 + {&PartMappings[4], 1}, // 32 + {&PartMappings[5], 1}, // 64 + {&PartMappings[6], 1}, // 128 + {&PartMappings[7], 1}, // 256 + {&PartMappings[8], 1}, // 512 + {&PartMappings[9], 1}, // 1024 + + // VGPRs + {&PartMappings[10], 1}, // 1 {nullptr, 0}, {nullptr, 0}, {nullptr, 0}, - {&PartMappings[10], 1}, - {&PartMappings[11], 1}, - {&PartMappings[12], 1}, - {&PartMappings[13], 1}, - {&PartMappings[14], 1}, - {&PartMappings[15], 1}, - {&PartMappings[16], 1}, - {&PartMappings[17], 1} + {&PartMappings[11], 1}, // 16 + {&PartMappings[12], 1}, // 32 + {&PartMappings[13], 1}, // 64 + {&PartMappings[14], 1}, // 128 + {&PartMappings[15], 1}, // 256 + {&PartMappings[16], 1}, // 512 + {&PartMappings[17], 1}, // 1024 + {&PartMappings[18], 1}, + {&PartMappings[19], 1} }; const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { - /*32-bit op*/ {0, 32, SGPRRegBank}, - /*2x32-bit op*/ {0, 32, SGPRRegBank}, - {32, 32, SGPRRegBank}, -/*<2x32-bit> op*/ {0, 64, SGPRRegBank}, - - /*32-bit op*/ {0, 32, VGPRRegBank}, - /*2x32-bit op*/ {0, 32, VGPRRegBank}, - {32, 32, VGPRRegBank}, + {0, 32, SGPRRegBank}, // 32-bit op + {0, 32, SGPRRegBank}, // 2x32-bit op + {32, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, // <2x32-bit> op + + {0, 32, VGPRRegBank}, // 32-bit op + {0, 32, VGPRRegBank}, // 2x32-bit op + {32, 32, VGPRRegBank}, }; @@ -116,7 +122,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { enum ValueMappingIdx { SCCStartIdx = 0, SGPRStartIdx = 2, - VGPRStartIdx = 12 + 
VGPRStartIdx = 13 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b31de0af5018..9f5bcd8ff5f0 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -218,12 +218,13 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, MaxKernArgAlign); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); + HSACodeProps.mKernargSegmentAlign = + std::max(MaxKernArgAlign, Align(4)).value(); HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; @@ -883,7 +884,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, auto Kern = HSAMetadataDoc->getMapNode(); - unsigned MaxKernArgAlign; + Align MaxKernArgAlign; Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode( STM.getKernArgSegmentSize(F, MaxKernArgAlign)); Kern[".group_segment_fixed_size"] = @@ -891,7 +892,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); Kern[".kernarg_segment_align"] = - Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign)); + Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value()); Kern[".wavefront_size"] = Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 2eecddbd7b01..80ac8ca67bcd 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -52,7 +52,7 @@ public: class MetadataStreamerV3 final : public MetadataStreamer { private: std::unique_ptr<msgpack::Document> HSAMetadataDoc = - llvm::make_unique<msgpack::Document>(); + std::make_unique<msgpack::Document>(); void dump(StringRef HSAMetadataString) const; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ea730539f834..f330bd7ebcdd 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -172,8 +172,9 @@ private: MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; - SDNode *glueCopyToM0LDSInit(SDNode *N) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; + SDNode *glueCopyToM0LDSInit(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -186,10 +187,11 @@ private: bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, 
SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -202,21 +204,20 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; + template <bool IsSigned> + bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; - template <bool IsSigned> - bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; SDValue Expand32BitAddress(SDValue Addr) const; @@ -262,6 +263,8 @@ private: SDValue getHi16Elt(SDValue In) const; + SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const; + void SelectADD_SUB_I64(SDNode *N); void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); @@ -282,6 +285,7 @@ private: void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectINTRINSIC_W_CHAIN(SDNode *N); + void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); protected: @@ -543,7 +547,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (!N->isMachineOpcode()) { if (N->getOpcode() == ISD::CopyToReg) { unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); return MRI.getRegClass(Reg); } @@ -582,19 +586,10 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { - const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); - - assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); - - SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), - Val); - - SDValue Glue = M0.getValue(1); - +SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain, + SDValue Glue) const { SmallVector <SDValue, 8> Ops; - Ops.push_back(M0); // Replace the chain. + Ops.push_back(NewChain); // Replace the chain. 
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) Ops.push_back(N->getOperand(i)); @@ -602,6 +597,16 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); + + SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val); + return glueCopyToOp(N, M0, M0.getValue(1)); +} + SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS) { @@ -635,13 +640,13 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: - return AMDGPU::SReg_32_XM0RegClassID; + return AMDGPU::SReg_32RegClassID; case 2: return AMDGPU::SReg_64RegClassID; case 3: return AMDGPU::SGPR_96RegClassID; case 4: - return AMDGPU::SReg_128RegClassID; + return AMDGPU::SGPR_128RegClassID; case 5: return AMDGPU::SGPR_160RegClassID; case 8: @@ -713,12 +718,17 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; // Already selected. } - if (isa<AtomicSDNode>(N) || + // isa<MemSDNode> almost works but is slightly too permissive for some DS + // intrinsics. + if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { N = glueCopyToM0LDSInit(N); + SelectCode(N); + return; + } switch (Opc) { default: @@ -781,7 +791,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RC, SubReg0, SubReg1; SDLoc DL(N); if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { @@ -815,14 +825,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); return; } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: { - N = glueCopyToM0LDSInit(N); - break; - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { // There is a scalar version available, but unlike the vector version which @@ -908,6 +910,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectINTRINSIC_W_CHAIN(N); return; } + case ISD::INTRINSIC_WO_CHAIN: { + SelectINTRINSIC_WO_CHAIN(N); + return; + } case ISD::INTRINSIC_VOID: { SelectINTRINSIC_VOID(N); return; @@ -961,6 +967,14 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val, + const SDLoc &DL) const { + SDNode *Mov = CurDAG->getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Val, DL, MVT::i32)); + return SDValue(Mov, 0); +} + // FIXME: Should only handle addcarry/subcarry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); @@ -1308,7 +1322,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, 
SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1321,6 +1336,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1400,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1408,7 +1424,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1430,9 +1446,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1557,13 +1573,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1585,16 +1602,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); +} - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); +// Find a load or store from corresponding pattern root. +// Roots may be build_vector, bitconvert or their combinations. 
+static MemSDNode* findMemSDNode(SDNode *N) { + N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); + if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) + return MN; + assert(isa<BuildVectorSDNode>(N)); + for (SDValue V : N->op_values()) + if (MemSDNode *MN = + dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) + return MN; + llvm_unreachable("cannot find MemSDNode in the pattern!"); } template <bool IsSigned> @@ -1603,8 +1634,95 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return static_cast<const SITargetLowering*>(getTargetLowering())-> - SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC); + int64_t OffsetVal = 0; + + if (Subtarget->hasFlatInstOffsets() && + (!Subtarget->hasFlatSegmentOffsetBug() || + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && + CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned AS = findMemSDNode(N)->getAddressSpace(); + if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } else { + // If the offset doesn't fit, put the low bits into the offset field and + // add the rest. + + SDLoc DL(N); + uint64_t ImmField; + const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); + if (IsSigned) { + ImmField = SignExtend64(COffsetVal, NumBits); + + // Don't use a negative offset field if the base offset is positive. + // Since the scheduler currently relies on the offset field, doing so + // could result in strange scheduling decisions. + + // TODO: Should we not do this in the opposite direction as well? + if (static_cast<int64_t>(COffsetVal) > 0) { + if (static_cast<int64_t>(ImmField) < 0) { + const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1); + ImmField = COffsetVal & OffsetMask; + } + } + } else { + // TODO: Should we do this for a negative offset? + const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); + ImmField = COffsetVal & OffsetMask; + } + + uint64_t RemainderOffset = COffsetVal - ImmField; + + assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); + assert(RemainderOffset + ImmField == COffsetVal); + + OffsetVal = ImmField; + + // TODO: Should this try to use a scalar add pseudo if the base address is + // uniform and saddr is usable? 
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetLo + = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue AddOffsetHi + = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + SDNode *Add = CurDAG->getMachineNode( + AMDGPU::V_ADD_I32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1 + }; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), 0); + } + } + + VAddr = Addr; + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; } bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, @@ -1616,10 +1734,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, } bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { + SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); } @@ -2158,10 +2276,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { // offset field) % 64. Some versions of the programming guide omit the m0 // part, or claim it's from offset 0. if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { - // If we have a constant offset, try to use the default value for m0 as a - // base to possibly avoid setting it up. - glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32)); - ImmOffset = ConstOffset->getZExtValue() + 1; + // If we have a constant offset, try to use the 0 in m0 as the base. + // TODO: Look into changing the default m0 initialization value. If the + // default -1 only set the low 16-bits, we could leave it as-is and add 1 to + // the immediate offset. + glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); + ImmOffset = ConstOffset->getZExtValue(); } else { if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { ImmOffset = BaseOffset.getConstantOperandVal(1); @@ -2182,22 +2302,7 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { glueCopyToM0(N, SDValue(M0Base, 0)); } - SDValue V0; SDValue Chain = N->getOperand(0); - SDValue Glue; - if (HasVSrc) { - SDValue VSrc0 = N->getOperand(2); - - // The manual doesn't mention this, but it seems only v0 works. - V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); - - SDValue CopyToV0 = CurDAG->getCopyToReg( - N->getOperand(0), SL, V0, VSrc0, - N->getOperand(N->getNumOperands() - 1)); - Chain = CopyToV0; - Glue = CopyToV0.getValue(1); - } - SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); // TODO: Can this just be removed from the instruction? 
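A minimal numeric sketch of the offset split performed by the new SelectFlatOffset code above when a constant offset does not fit in the FLAT offset field. The 12-bit width below is only an assumption for the example; the real width comes from SIInstrInfo::getNumFlatOffsetBits and depends on the subtarget and address space, and the signed path additionally avoids a negative immediate when the whole offset is positive.

#include <cassert>
#include <cstdint>
#include <utility>

// Split an unsigned byte offset into an immediate that fits in NumBits and a
// remainder that is added to the base address with a separate 64-bit add.
static std::pair<uint64_t, uint64_t> splitFlatOffset(uint64_t COffsetVal,
                                                     unsigned NumBits) {
  const uint64_t OffsetMask = (uint64_t(1) << NumBits) - 1;
  const uint64_t ImmField = COffsetVal & OffsetMask;
  const uint64_t RemainderOffset = COffsetVal - ImmField;
  assert(ImmField + RemainderOffset == COffsetVal);
  return {ImmField, RemainderOffset};
}

// Example: splitFlatOffset(0x1234, 12) yields {0x234, 0x1000}; 0x234 goes in
// the instruction's offset field and 0x1000 ends up in the vaddr add built
// from V_ADD_I32_e64 / V_ADDC_U32_e64 above.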
@@ -2206,14 +2311,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { const unsigned Opc = gwsIntrinToOpcode(IntrID); SmallVector<SDValue, 5> Ops; if (HasVSrc) - Ops.push_back(V0); + Ops.push_back(N->getOperand(2)); Ops.push_back(OffsetField); Ops.push_back(GDS); Ops.push_back(Chain); - if (HasVSrc) - Ops.push_back(Glue); - SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } @@ -2233,6 +2335,28 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { SelectCode(N); } +void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned Opcode; + switch (IntrID) { + case Intrinsic::amdgcn_wqm: + Opcode = AMDGPU::WQM; + break; + case Intrinsic::amdgcn_softwqm: + Opcode = AMDGPU::SOFT_WQM; + break; + case Intrinsic::amdgcn_wwm: + Opcode = AMDGPU::WWM; + break; + default: + SelectCode(N); + return; + } + + SDValue Src = N->getOperand(1); + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); +} + void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IntrID) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 39016ed37193..1115d8c23620 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -12,10 +12,6 @@ // //===----------------------------------------------------------------------===// -#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f -#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f -#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f - #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" @@ -37,82 +33,9 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; -static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State, - const TargetRegisterClass *RC, - unsigned NumRegs) { - ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs); - unsigned RegResult = State.AllocateReg(RegList); - if (RegResult == AMDGPU::NoRegister) - return false; - - State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); - return true; -} - -static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - // Up to SGPR0-SGPR105 - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 53); - } - default: - return false; - } -} - -// Allocate up to VGPR31. -// -// TODO: Since there are no VGPR alignent requirements would it be better to -// split into individual scalar registers? 
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::i64: - case MVT::f64: - case MVT::v2i32: - case MVT::v2f32: - case MVT::v4i16: - case MVT::v4f16: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_64RegClass, 31); - } - case MVT::v4i32: - case MVT::v4f32: - case MVT::v2i64: - case MVT::v2f64: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_128RegClass, 29); - } - case MVT::v8i32: - case MVT::v8f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_256RegClass, 25); - - } - case MVT::v16i32: - case MVT::v16f32: { - return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::VReg_512RegClass, 17); - - } - default: - return false; - } -} - #include "AMDGPUGenCallingConv.inc" // Find a larger type to do a load / store of a vector with. @@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); } - for (MVT VT : MVT::integer_vector_valuetypes()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); @@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); @@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); @@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); + setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FABS); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::AssertSext); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); } 
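The ISD::INTRINSIC_WO_CHAIN combine registered just above is what later lets performIntrinsicWOChainCombine (further down in this diff) route llvm.amdgcn.mul.i24 and llvm.amdgcn.mul.u24 through simplifyI24. As a hedged reference for why that narrowing is safe, the unsigned 24-bit multiply only ever reads the low 24 bits of each operand; the scalar model below is illustrative and not part of the patch.

#include <cstdint>

// Reference semantics for the u24 multiply: only the low 24 bits of each
// operand contribute, which is why the combine can shrink the operands with
// GetDemandedBits before forming MUL_U24.
static uint32_t mulU24Reference(uint32_t A, uint32_t B) {
  const uint32_t Mask24 = (1u << 24) - 1;
  return (A & Mask24) * (B & Mask24);
}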
//===----------------------------------------------------------------------===// @@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) { case ISD::FREM: case ISD::INLINEASM: case ISD::INLINEASM_BR: - case AMDGPUISD::INTERP_P1: - case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + case ISD::INTRINSIC_W_CHAIN: // TODO: Should really be looking at the users of the bitcast. These are // problematic because bitcasts are used to legalize all stores to integer // types. case ISD::BITCAST: return false; + case ISD::INTRINSIC_WO_CHAIN: { + switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) { + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: + return false; + default: + return true; + } + } default: return true; } @@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, return false; bool Fast = false; - return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy, - MMO, &Fast) && Fast; + return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + CastTy, MMO, &Fast) && + Fast; } // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also @@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { break; case ISD::LOAD: { - const LoadSDNode * L = dyn_cast<LoadSDNode>(N); - if (L->getMemOperand()->getAddrSpace() - == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() == + AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; } @@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::FLOG: - return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); + return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef); case ISD::FLOG10: - return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); + return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f); case ISD::FEXP: return lowerFEXP(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); @@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, } } -static bool hasDefinedInitializer(const GlobalValue *GV) { +bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); if (!GVar || !GVar->hasInitializer()) return false; @@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); } -// Return M_LOG2E of appropriate type -static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) { - switch (VT.getScalarType().getSimpleVT().SimpleTy) { - case MVT::f32: - return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT); - case MVT::f16: - return DAG.getConstantFP( - APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"), - SL, VT); - case MVT::f64: - return DAG.getConstantFP( - APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT); - default: - llvm_unreachable("unsupported fp type"); - } -} - // exp2(M_LOG2E_F * f); SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); SDValue Src = Op.getOperand(0); - const SDValue K = getLog2EVal(DAG, SL, VT); + const SDValue K = DAG.getConstantFP(numbers::log2e, 
SL, VT); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); } @@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue LHS = Node24->getOperand(0); - SDValue RHS = Node24->getOperand(1); + bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; + + SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); + SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); + unsigned NewOpcode = Node24->getOpcode(); + if (IsIntrin) { + unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); + NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? + AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + } APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); @@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24, SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); if (DemandedLHS || DemandedRHS) - return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); @@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { return true; } -// Find a load or store from corresponding pattern root. -// Roots may be build_vector, bitconvert or their combinations. -static MemSDNode* findMemSDNode(SDNode *N) { - N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); - if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) - return MN; - assert(isa<BuildVectorSDNode>(N)); - for (SDValue V : N->op_values()) - if (MemSDNode *MN = - dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) - return MN; - llvm_unreachable("cannot find MemSDNode in the pattern!"); -} - -bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned, - SelectionDAG &DAG, - SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { - const GCNSubtarget &ST = - DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); - int64_t OffsetVal = 0; - - if (ST.hasFlatInstOffsets() && - (!ST.hasFlatSegmentOffsetBug() || - findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && - DAG.isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - - const SIInstrInfo *TII = ST.getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(), - IsSigned)) { - Addr = N0; - OffsetVal = COffsetVal; - } - } - - VAddr = Addr; - Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1); - - return true; -} - // Replace load of an illegal type with a store of a bitcast to a friendlier // type. 
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, @@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, return SDValue(); } + +SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mul_i24: + case Intrinsic::amdgcn_mul_u24: + return simplifyI24(N, DCI); + default: + return SDValue(); + } +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicWOChainCombine(N, DCI); } return SDValue(); } @@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } +// This may be called multiple times, and nothing prevents creating multiple +// objects at the same offset. See if we already defined this object. +static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, + int64_t Offset) { + for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { + if (MFI.getObjectOffset(I) == Offset) { + assert(MFI.getObjectSize(I) == Size); + return I; + } + } + + return MFI.CreateFixedObject(Size, Offset, true); +} + SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); - int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); @@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); - unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); + const Align Alignment = ST.getAlignmentForImplicitArgPtr(); uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + ExplicitArgOffset; switch (Param) { @@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(DENORM_MODE) NODE_NAME_CASE(FMA_W_CHAIN) NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) @@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(INIT_EXEC) - NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(SENDMSGHALT) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(INTERP_P1LL_F16) NODE_NAME_CASE(INTERP_P1LV_F16) NODE_NAME_CASE(INTERP_P2_F16) @@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_AND) NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_INC) + 
NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) @@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; } else if (SelBits == 0x0c) { - Known.Zero |= 0xff << I; + Known.Zero |= 0xFFull << I; } else if (SelBits > 0x0c) { - Known.One |= 0xff << I; + Known.One |= 0xFFull << I; } Sel >>= 8; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fe7ad694943d..dea0d1d4343a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -38,6 +38,7 @@ private: public: static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG); static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); + static bool hasDefinedInitializer(const GlobalValue *GV); protected: SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; @@ -78,6 +79,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -324,10 +326,6 @@ public: } AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - - bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N, - SDValue Addr, SDValue &VAddr, SDValue &Offset, - SDValue &SLC) const; }; namespace AMDGPUISD { @@ -369,6 +367,9 @@ enum NodeType : unsigned { // result bit per item in the wavefront. SETCC, SETREG, + + DENORM_MODE, + // FP ops with input and output chain. FMA_W_CHAIN, FMUL_W_CHAIN, @@ -475,13 +476,6 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. 
CONST_DATA_PTR, - INIT_EXEC, - INIT_EXEC_FROM_INPUT, - SENDMSG, - SENDMSGHALT, - INTERP_MOV, - INTERP_P1, - INTERP_P2, INTERP_P1LL_F16, INTERP_P1LV_F16, INTERP_P2_F16, @@ -532,6 +526,8 @@ enum NodeType : unsigned { BUFFER_ATOMIC_AND, BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_INC, + BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index f4df20b8f03e..a83ec23ec054 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -51,7 +51,7 @@ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), // Inliner constraint to achieve reasonable compilation time static cl::opt<size_t> -MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300), +MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum BB number allowed in a function after inlining" " (compile time constraint)")); diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 4a8446955496..cf0ce5659951 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -110,39 +110,38 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; // Force dependencies for vector trunc stores def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>; -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - +def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; // out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; +def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; +def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) -def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. 
-def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; +def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; -def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; -def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; -def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; -def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; +def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; +def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; +def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; +def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. // This is not commutative because this gives the second operand: @@ -285,7 +284,7 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; -def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; @@ -320,7 +319,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, [] >; -def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, @@ -330,35 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", - SDTypeProfile<0, 2, - [SDTCisInt<0>, SDTCisInt<1>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", SDTypeProfile<1, 7, [SDTCisFP<0>]>, [SDNPInGlue, SDNPOutGlue]>; @@ -425,3 +395,65 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, def 
AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; + + +//===----------------------------------------------------------------------===// +// Intrinsic/Custom node compatibility PatFrags +//===----------------------------------------------------------------------===// + +def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src), + (AMDGPUrcp_impl node:$src)]>; +def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src), + (AMDGPUrcp_legacy_impl node:$src)]>; + +def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src), + (AMDGPUrsq_legacy_impl node:$src)]>; + +def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src), + (AMDGPUrsq_impl node:$src)]>; + +def AMDGPUrsq_clamp : PatFrags<(ops node:$src), [(int_amdgcn_rsq_clamp node:$src), + (AMDGPUrsq_clamp_impl node:$src)]>; + +def AMDGPUsin : PatFrags<(ops node:$src), [(int_amdgcn_sin node:$src), + (AMDGPUsin_impl node:$src)]>; +def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), + (AMDGPUcos_impl node:$src)]>; +def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), + (AMDGPUfract_impl node:$src)]>; + +def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_ldexp node:$src0, node:$src1), + (AMDGPUldexp_impl node:$src0, node:$src1)]>; + +def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_class node:$src0, node:$src1), + (AMDGPUfp_class_impl node:$src0, node:$src1)]>; + +def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2), + (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUffbh_i32 : PatFrags<(ops node:$src), + [(int_amdgcn_sffbh node:$src), + (AMDGPUffbh_i32_impl node:$src)]>; + +def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1), + (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_i16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_i16 node:$src0, node:$src1), + (AMDGPUpknorm_i16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpknorm_u16_f32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pknorm_u16 node:$src0, node:$src1), + (AMDGPUpknorm_u16_f32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_i16_i32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_i16 node:$src0, node:$src1), + (AMDGPUpk_i16_i32_impl node:$src0, node:$src1)]>; + +def AMDGPUpk_u16_u32 : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_cvt_pk_u16 node:$src0, node:$src1), + (AMDGPUpk_u16_u32_impl node:$src0, node:$src1)]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 901a2eaa8829..3cfa9d57ec46 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -19,8 +19,10 @@ #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -61,8 +63,14 @@
AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } +void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) { + MRI = &MF.getRegInfo(); + InstructionSelector::setupMF(MF, KB, CoverageInfo); +} + static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == AMDGPU::SCC; auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -71,7 +79,9 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { if (RC) { // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the // context of the register bank has been lost. - if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID) + // Has a hack getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which + // won't ever beconstrained any further. + if (RC != &AMDGPU::SGPR_32RegClass) return false; const LLT Ty = MRI.getType(Reg); return Ty.isValid() && Ty.getSizeInBits() == 1; @@ -83,7 +93,7 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) { bool AMDGPUInstructionSelector::isVCC(Register Reg, const MachineRegisterInfo &MRI) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return Reg == TRI.getVCC(); auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); @@ -102,8 +112,6 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); I.setDesc(TII.get(TargetOpcode::COPY)); const MachineOperand &Src = I.getOperand(1); @@ -111,33 +119,33 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { Register DstReg = Dst.getReg(); Register SrcReg = Src.getReg(); - if (isVCC(DstReg, MRI)) { + if (isVCC(DstReg, *MRI)) { if (SrcReg == AMDGPU::SCC) { const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); + = TRI.getConstrainedRegClassForOperand(Dst, *MRI); if (!RC) return true; - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } - if (!isVCC(SrcReg, MRI)) { + if (!isVCC(SrcReg, *MRI)) { // TODO: Should probably leave the copy and let copyPhysReg expand it. - if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI)) + if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) return false; BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) .addImm(0) .addReg(SrcReg); - if (!MRI.getRegClassOrNull(SrcReg)) - MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI)); + if (!MRI->getRegClassOrNull(SrcReg)) + MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI)); I.eraseFromParent(); return true; } const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) return false; // Don't constrain the source register to a class so the def instruction @@ -148,8 +156,8 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 
if (Src.isUndef()) { const TargetRegisterClass *SrcRC = - TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; } @@ -157,30 +165,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { } for (const MachineOperand &MO : I.operands()) { - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) continue; const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } return true; } bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const Register DefReg = I.getOperand(0).getReg(); - const LLT DefTy = MRI.getType(DefReg); + const LLT DefTy = MRI->getType(DefReg); // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) const RegClassOrRegBank &RegClassOrBank = - MRI.getRegClassOrRegBank(DefReg); + MRI->getRegClassOrRegBank(DefReg); const TargetRegisterClass *DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); @@ -196,7 +200,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { return false; } - DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI); + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -204,7 +208,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } I.setDesc(TII.get(TargetOpcode::PHI)); - return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); + return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); } MachineOperand @@ -214,13 +218,11 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, MachineInstr *MI = MO.getParent(); MachineBasicBlock *BB = MO.getParent()->getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = MRI.createVirtualRegister(&SubRC); + Register DstReg = MRI->createVirtualRegister(&SubRC); if (MO.isReg()) { unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) .addReg(Reg, 0, ComposedSubIdx); @@ -244,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, } } -static int64_t getConstant(const MachineInstr *MI) { - return MI->getOperand(1).getCImm()->getSExtValue(); -} - static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { switch (Opc) { case AMDGPU::G_AND: @@ -262,16 +260,13 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { } bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &Dst = I.getOperand(0); MachineOperand &Src0 = I.getOperand(1); MachineOperand &Src1 = I.getOperand(2); Register DstReg = Dst.getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); - const RegisterBank *DstRB = 
RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); if (DstRB->getID() == AMDGPU::VCCRegBankID) { const TargetRegisterClass *RC = TRI.getBoolRC(); unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), @@ -282,12 +277,12 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { // The selector for G_ICMP relies on seeing the register bank for the result // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will // be ambiguous whether it's a scalar or vector bool. - if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg())) - MRI.setRegClass(Src0.getReg(), RC); - if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg())) - MRI.setRegClass(Src1.getReg(), RC); + if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) + MRI->setRegClass(Src0.getReg(), RC); + if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) + MRI->setRegClass(Src1.getReg(), RC); - return RBI.constrainGenericRegister(DstReg, *RC, MRI); + return RBI.constrainGenericRegister(DstReg, *RC, *MRI); } // TODO: Should this allow an SCC bank result, and produce a copy from SCC for @@ -295,14 +290,7 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { if (DstRB->getID() == AMDGPU::SGPRRegBankID) { unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); I.setDesc(TII.get(InstOpc)); - - const TargetRegisterClass *RC - = TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (!RC) - return false; - return RBI.constrainGenericRegister(DstReg, *RC, MRI) && - RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) && - RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } return false; @@ -311,11 +299,10 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; @@ -340,7 +327,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { const unsigned Opc = Sub ? 
AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64; - Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass()); + Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); MachineInstr *Add = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) .addDef(UnusedCarry, RegState::Dead) @@ -363,8 +350,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); - Register DstLo = MRI.createVirtualRegister(&HalfRC); - Register DstHi = MRI.createVirtualRegister(&HalfRC); + Register DstLo = MRI->createVirtualRegister(&HalfRC); + Register DstHi = MRI->createVirtualRegister(&HalfRC); if (IsSALU) { BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) @@ -375,14 +362,14 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(Hi2); } else { const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); - Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo) .addDef(CarryReg) .add(Lo1) .add(Lo2) .addImm(0); MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) - .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead) + .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) .add(Hi1) .add(Hi2) .addReg(CarryReg, RegState::Kill) @@ -399,19 +386,61 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .addImm(AMDGPU::sub1); - if (!RBI.constrainGenericRegister(DstReg, RC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) return false; I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - assert(I.getOperand(2).getImm() % 32 == 0); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + const DebugLoc &DL = I.getDebugLoc(); + Register Dst0Reg = I.getOperand(0).getReg(); + Register Dst1Reg = I.getOperand(1).getReg(); + const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO; + + if (!isSCC(Dst1Reg, MRI)) { + // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned + // carry out despite the _i32 name. These were renamed in VI to _U32. + // FIXME: We should probably rename the opcodes here. + unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + I.setDesc(TII.get(NewOpc)); + I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + Register Src0Reg = I.getOperand(2).getReg(); + Register Src1Reg = I.getOperand(3).getReg(); + unsigned NewOpc = IsAdd ? 
AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) + .addReg(AMDGPU::SCC); + + if (!MRI.getRegClassOrNull(Dst1Reg)) + MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + + if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) || + !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI)) + return false; + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); const DebugLoc &DL = I.getDebugLoc(); MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), I.getOperand(0).getReg()) @@ -419,10 +448,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { for (const MachineOperand &MO : Copy->operands()) { const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); + TRI.getConstrainedRegClassForOperand(MO, *MRI); if (!RC) continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); } I.eraseFromParent(); return true; @@ -430,21 +459,19 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + LLT DstTy = MRI->getType(DstReg); + LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); const unsigned SrcSize = SrcTy.getSizeInBits(); if (SrcSize < 32) return false; const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const unsigned DstSize = DstTy.getSizeInBits(); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); if (!DstRC) return false; @@ -457,12 +484,12 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MIB.addImm(SubRegs[I]); const TargetRegisterClass *SrcRC - = TRI.getConstrainedRegClassForOperand(Src, MRI); - if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI)) + = TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) return false; } - if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; MI.eraseFromParent(); @@ -471,25 +498,23 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const int NumDst = MI.getNumOperands() - 1; MachineOperand &Src = MI.getOperand(NumDst); Register SrcReg = Src.getReg(); Register DstReg0 = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg0); - LLT SrcTy = 
MRI.getType(SrcReg); + LLT DstTy = MRI->getType(DstReg0); + LLT SrcTy = MRI->getType(SrcReg); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); const DebugLoc &DL = MI.getDebugLoc(); - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI); - if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI)) + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; const unsigned SrcFlags = getUndefRegState(Src.isUndef()); @@ -504,8 +529,8 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { .addReg(SrcReg, SrcFlags, SubRegs[I]); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(Dst, MRI); - if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI)) + TRI.getConstrainedRegClassForOperand(Dst, *MRI); + if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) return false; } @@ -518,16 +543,13 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const MachineOperand &MO = I.getOperand(0); // FIXME: Interface for getConstrainedRegClassForOperand needs work. The // regbank check here is to know why getConstrainedRegClassForOperand failed. - const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI); - if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) || - (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) { + const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); + if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || + (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); return true; } @@ -537,44 +559,62 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); - DebugLoc DL = I.getDebugLoc(); - MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(I.getOperand(0).getReg()) - .addReg(I.getOperand(1).getReg()) - .addReg(I.getOperand(2).getReg()) - .addImm(SubReg); - - for (const MachineOperand &MO : Ins->operands()) { - if (!MO.isReg()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - } + Register DstReg = I.getOperand(0).getReg(); + Register Src0Reg = I.getOperand(1).getReg(); + Register Src1Reg = I.getOperand(2).getReg(); + LLT Src1Ty = MRI->getType(Src1Reg); + + unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); + unsigned InsSize = Src1Ty.getSizeInBits(); + + int64_t Offset = I.getOperand(3).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); + if (SubReg == AMDGPU::NoSubRegister) + return 
false; + + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + if (!DstRC) + return false; + + const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); + const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); + const TargetRegisterClass *Src0RC = + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + const TargetRegisterClass *Src1RC = + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + + // Deal with weird cases where the class only partially supports the subreg + // index. + Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); + if (!Src0RC) + return false; + + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || + !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addImm(SubReg); + + I.eraseFromParent(); return true; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { - unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); +bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::maxnum: - case Intrinsic::minnum: - case Intrinsic::amdgcn_cvt_pkrtz: - return selectImpl(I, CoverageInfo); case Intrinsic::amdgcn_if_break: { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64.
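
(Illustrative aside: the reworked selectG_EXTRACT/selectG_INSERT above only accept bit offsets that are multiples of 32 and map each aligned offset to a 32-bit subregister channel. The sketch below shows that rule in plain standalone C++, not the LLVM API; channelForBitOffset is a made-up name used only for illustration.)

#include <cassert>
#include <cstdint>
#include <optional>

// Hypothetical helper mirroring the selection rule above: offsets that are
// not 32-bit aligned are rejected (the selector returns false); aligned
// offsets index a 32-bit channel (sub0, sub1, sub2, ...).
std::optional<unsigned> channelForBitOffset(int64_t Offset) {
  if (Offset % 32 != 0)
    return std::nullopt;
  return static_cast<unsigned>(Offset / 32);
}

int main() {
  assert(channelForBitOffset(64) == 2u); // lands in sub2
  assert(!channelForBitOffset(48));      // misaligned: not selectable here
}

(The real code additionally passes InsSize / 32 to getSubRegFromChannel for multi-register inserts and bails out if no such subregister index exists.)
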
@@ -589,15 +629,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC( I.eraseFromParent(); - for (Register Reg : { DstReg, Src0Reg, Src1Reg }) { - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); - } + for (Register Reg : { DstReg, Src0Reg, Src1Reg }) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } @@ -677,17 +715,15 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned SrcReg = I.getOperand(2).getReg(); - unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI); + Register SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); - unsigned CCReg = I.getOperand(0).getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = I.getOperand(0).getReg(); + if (isSCC(CCReg, *MRI)) { int Opcode = getS_CMPOpcode(Pred, Size); if (Opcode == -1) return false; @@ -698,7 +734,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .addReg(AMDGPU::SCC); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && - RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI); + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); I.eraseFromParent(); return Ret; } @@ -712,7 +748,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { .add(I.getOperand(2)) .add(I.getOperand(3)); RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), - *TRI.getBoolRC(), MRI); + *TRI.getBoolRC(), *MRI); bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); I.eraseFromParent(); return Ret; @@ -736,19 +772,273 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, .addImm(Enabled); } +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) + return true; + + // FIXME: matcher should ignore copies + return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; +} + +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; +} + +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; +} + +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; +} + +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; +} + +// Returns Base register, constant offset, and offset def point. 
+static std::tuple<Register, unsigned, MachineInstr *> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (!Def) + return std::make_tuple(Reg, 0, nullptr); + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + unsigned Offset; + const MachineOperand &Op = Def->getOperand(1); + if (Op.isImm()) + Offset = Op.getImm(); + else + Offset = Op.getCImm()->getZExtValue(); + + return std::make_tuple(Register(), Offset, Def); + } + + int64_t Offset; + if (Def->getOpcode() == AMDGPU::G_ADD) { + // TODO: Handle G_OR used for add case + if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset))) + return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); + + // FIXME: matcher should ignore copies + if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset)))) + return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def); + } + + return std::make_tuple(Reg, 0, Def); +} + +static unsigned getBufferStoreOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + const int Size = Ty.getSizeInBits(); + switch (8 * MemSize) { + case 8: + return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + case 16: + return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + default: + unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + return Opc; + } +} + +static unsigned getBufferStoreFormatOpcode(LLT Ty, + const unsigned MemSize, + const bool Offen) { + bool IsD16Packed = Ty.getScalarSizeInBits() == 16; + bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); + int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; + + if (IsD16Packed) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; + default: + return -1; + } + } + + if (IsD16Unpacked) { + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; + case 4: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; + default: + return -1; + } + } + + switch (NumElts) { + case 1: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; + case 2: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; + case 3: + return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; + case 4: + return Offen ? 
AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : + AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; + default: + return -1; + } + + llvm_unreachable("unhandled buffer store"); +} + +// TODO: Move this to combiner +// Returns base register, imm offset, total constant offset. +std::tuple<Register, unsigned, unsigned> +AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = getBaseWithConstantOffset(*MRI, OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store.f + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + // In case this is in a waterfall loop, insert offset code at the def point + // of the offset, not inside the loop. + MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); + MachineBasicBlock &OldMBB = B.getMBB(); + B.setInstr(*OffsetDef); + + if (!BaseReg) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(BaseReg) + .addImm(Overflow); + } else { + Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + B.buildInstr(AMDGPU::V_MOV_B32_e32) + .addDef(OverflowVal) + .addImm(Overflow); + + Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) + .addReg(BaseReg) + .addReg(OverflowVal, RegState::Kill) + .addImm(0); + BaseReg = NewBaseReg; + } + + B.setInsertPt(OldMBB, OldInsPt); + } + + return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); +} + +bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, + bool IsFormat) const { + MachineIRBuilder B(MI); + MachineFunction &MF = B.getMF(); + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI->getType(VData); + + int Size = Ty.getSizeInBits(); + if (Size % 32 != 0) + return false; + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); + + const bool Offen = !isZero(VOffset, *MRI); + + int Opc = IsFormat ? 
getBufferStoreFormatOpcode(Ty, MemSize, Offen) : + getBufferStoreOpcode(Ty, MemSize, Offen); + if (Opc == -1) + return false; + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) + .addMemOperand(MMO); + + MI.eraseFromParent(); + + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I, CodeGenCoverage &CoverageInfo) const { + MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned IntrinsicID = I.getOperand(0).getIntrinsicID(); + unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + int64_t Done = I.getOperand(7).getImm(); + int64_t VM = I.getOperand(8).getImm(); MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), I.getOperand(4).getReg(), @@ -761,13 +1051,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( } case Intrinsic::amdgcn_exp_compr: { const DebugLoc &DL = I.getDebugLoc(); - int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); - int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); - unsigned Reg0 = I.getOperand(3).getReg(); - unsigned Reg1 = I.getOperand(4).getReg(); - unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg())); - int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg())); + int64_t Tgt = I.getOperand(1).getImm(); + int64_t Enabled = I.getOperand(2).getImm(); + Register Reg0 = I.getOperand(3).getReg(); + Register Reg1 = I.getOperand(4).getReg(); + Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + int64_t Done = I.getOperand(5).getImm(); + int64_t VM = I.getOperand(6).getImm(); BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM, @@ -786,27 +1076,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( Register Reg = I.getOperand(1).getReg(); I.eraseFromParent(); - if (!MRI.getRegClassOrNull(Reg)) - MRI.setRegClass(Reg, TRI.getWaveMaskRegClass()); + if (!MRI->getRegClassOrNull(Reg)) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); return true; } + case Intrinsic::amdgcn_raw_buffer_store: + return selectStoreIntrinsic(I, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return selectStoreIntrinsic(I, true); default: - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); } } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = I.getDebugLoc(); - unsigned DstReg = 
I.getOperand(0).getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + Register DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); assert(Size <= 32 || Size == 64); const MachineOperand &CCOp = I.getOperand(1); - unsigned CCReg = CCOp.getReg(); - if (isSCC(CCReg, MRI)) { + Register CCReg = CCOp.getReg(); + if (isSCC(CCReg, *MRI)) { unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -815,8 +1107,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { // The generic constrainSelectedInstRegOperands doesn't work for the scc register // bank, because it does not cover the register class that we used to represent // for it. So we need to manually set the register class here. - if (!MRI.getRegClassOrNull(CCReg)) - MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI)); + if (!MRI->getRegClassOrNull(CCReg)) + MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) .add(I.getOperand(2)) .add(I.getOperand(3)); @@ -845,52 +1137,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); - if (PtrSize != 64) { - LLVM_DEBUG(dbgs() << "Unhandled address space\n"); - return false; - } - - unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); - unsigned Opcode; - - // FIXME: Remove this when integers > s32 naturally selected. - switch (StoreSize) { - default: - return false; - case 32: - Opcode = AMDGPU::FLAT_STORE_DWORD; - break; - case 64: - Opcode = AMDGPU::FLAT_STORE_DWORDX2; - break; - case 96: - Opcode = AMDGPU::FLAT_STORE_DWORDX3; - break; - case 128: - Opcode = AMDGPU::FLAT_STORE_DWORDX4; - break; - } - - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) - .add(I.getOperand(1)) - .add(I.getOperand(0)) - .addImm(0) // offset - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // dlc - - - // Now that we selected an opcode, we need to constrain the register - // operands to use appropriate classes. 
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); - - I.eraseFromParent(); - return Ret; + initM0(I); + return selectImpl(I, *CoverageInfo); } static int sizeToSubRegIndex(unsigned Size) { @@ -915,19 +1163,15 @@ static int sizeToSubRegIndex(unsigned Size) { } bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); if (!DstTy.isScalar()) return false; - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); - const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcRB != DstRB) return false; @@ -935,9 +1179,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { unsigned SrcSize = SrcTy.getSizeInBits(); const TargetRegisterClass *SrcRC - = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI); + = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); const TargetRegisterClass *DstRC - = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI); + = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); if (SrcSize > 32) { int SubRegIdx = sizeToSubRegIndex(DstSize); @@ -953,8 +1197,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { I.getOperand(1).setSubReg(SubRegIdx); } - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); return false; } @@ -974,20 +1218,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { bool Signed = I.getOpcode() == AMDGPU::G_SEXT; const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock &MBB = *I.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const LLT SrcTy = MRI.getType(SrcReg); + const LLT DstTy = MRI->getType(DstReg); + const LLT SrcTy = MRI->getType(SrcReg); const LLT S1 = LLT::scalar(1); const unsigned SrcSize = SrcTy.getSizeInBits(); const unsigned DstSize = DstTy.getSizeInBits(); if (!DstTy.isScalar()) return false; - const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); if (SrcBank->getID() == AMDGPU::SCCRegBankID) { if (SrcTy != S1 || DstSize > 64) // Invalid @@ -1000,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // FIXME: Create an extra copy to avoid incorrectly constraining the result // of the scc producer. 
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg) .addReg(SrcReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) @@ -1010,7 +1252,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(Opcode), DstReg) .addImm(0) .addImm(Signed ? -1 : 1); - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) { @@ -1024,6 +1267,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(0) // src1_modifiers .addImm(Signed ? -1 : 1) // src1 .addUse(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1040,6 +1284,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) .addImm(Mask) .addReg(SrcReg); + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } @@ -1049,11 +1294,12 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(SrcReg) .addImm(0) // Offset .addImm(SrcSize); // Width + I.eraseFromParent(); return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { - if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI)) + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) return false; if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { @@ -1061,7 +1307,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) .addReg(SrcReg); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; @@ -1070,10 +1317,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. if (DstSize > 32 && SrcSize <= 32) { // We need a 64-bit register source, but the high bits don't matter. 
- unsigned ExtReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned UndefReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) .addReg(SrcReg) @@ -1085,7 +1330,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addReg(ExtReg) .addImm(SrcSize << 16); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); } unsigned Mask; @@ -1099,16 +1345,58 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { .addImm(SrcSize << 16); } - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } return false; } +static int64_t getFPTrueImmVal(unsigned Size, bool Signed) { + switch (Size) { + case 16: + return Signed ? 0xBC00 : 0x3C00; + case 32: + return Signed ? 0xbf800000 : 0x3f800000; + case 64: + return Signed ? 0xbff0000000000000 : 0x3ff0000000000000; + default: + llvm_unreachable("Invalid FP type size"); + } +} + +bool AMDGPUInstructionSelector::selectG_SITOFP_UITOFP(MachineInstr &I) const { + MachineBasicBlock *MBB = I.getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Src = I.getOperand(1).getReg(); + if (!isSCC(Src, MRI)) + return selectImpl(I, *CoverageInfo); + + bool Signed = I.getOpcode() == AMDGPU::G_SITOFP; + Register DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const unsigned DstSize = DstTy.getSizeInBits(); + const DebugLoc &DL = I.getDebugLoc(); + + BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(Src); + + unsigned NewOpc = + DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + auto MIB = BuildMI(*MBB, I, DL, TII.get(NewOpc), DstReg) + .addImm(0) + .addImm(getFPTrueImmVal(DstSize, Signed)); + + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &ImmOp = I.getOperand(1); // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 
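
(Illustrative aside: the immediates returned by the new getFPTrueImmVal are the IEEE-754 encodings of +/-1.0 for each width, which is what the S_CSELECT in selectG_SITOFP_UITOFP materializes when the SCC source bit is true. A small standalone check of those bit patterns follows; this snippet is not part of the patch.)

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret a bit pattern as a float/double without violating aliasing rules.
template <typename To, typename From>
static To bitCast(From Src) {
  static_assert(sizeof(To) == sizeof(From), "size mismatch");
  To Dst;
  std::memcpy(&Dst, &Src, sizeof(To));
  return Dst;
}

int main() {
  // The 32- and 64-bit values from getFPTrueImmVal decode to IEEE-754 +/-1.0.
  assert(bitCast<float>(uint32_t(0x3f800000u)) == 1.0f);
  assert(bitCast<float>(uint32_t(0xbf800000u)) == -1.0f);
  assert(bitCast<double>(uint64_t(0x3ff0000000000000ull)) == 1.0);
  assert(bitCast<double>(uint64_t(0xbff0000000000000ull)) == -1.0);
  // 0x3C00 / 0xBC00 are the matching half-precision encodings of +/-1.0.
  return 0;
}
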
@@ -1119,15 +1407,15 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); } - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); unsigned Size; bool IsSgpr; - const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); + const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); if (RB) { IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; - Size = MRI.getType(DstReg).getSizeInBits(); + Size = MRI->getType(DstReg).getSizeInBits(); } else { - const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); + const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); IsSgpr = TRI.isSGPRClass(RC); Size = TRI.getRegSizeInBits(*RC); } @@ -1142,34 +1430,41 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - DebugLoc DL = I.getDebugLoc(); - const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass : - &AMDGPU::VGPR_32RegClass; - unsigned LoReg = MRI.createVirtualRegister(RC); - unsigned HiReg = MRI.createVirtualRegister(RC); - const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); - - BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) - .addImm(Imm.trunc(32).getZExtValue()); + const DebugLoc &DL = I.getDebugLoc(); - BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) - .addImm(Imm.ashr(32).getZExtValue()); + APInt Imm(Size, I.getOperand(1).getImm()); - const MachineInstr *RS = - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + MachineInstr *ResInst; + if (IsSgpr && TII.isInlineConstant(Imm)) { + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) + .addImm(I.getOperand(1).getImm()); + } else { + const TargetRegisterClass *RC = IsSgpr ? + &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; + Register LoReg = MRI->createVirtualRegister(RC); + Register HiReg = MRI->createVirtualRegister(RC); + + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + } // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); + TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); if (!DstRC) return true; - return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } static bool isConstant(const MachineInstr &MI) { @@ -1188,13 +1483,13 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, GEPInfo GEPInfo(*PtrMI); - for (unsigned i = 1, e = 3; i < e; ++i) { + for (unsigned i = 1; i != 3; ++i) { const MachineOperand &GEPOp = PtrMI->getOperand(i); const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); assert(OpDef); - if (isConstant(*OpDef)) { - // FIXME: Is it possible to have multiple Imm parts? Maybe if we - // are lacking other optimizations. + if (i == 2 && isConstant(*OpDef)) { + // TODO: Could handle constant base + variable offset, but a combine + // probably should have commuted it. 
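For the non-inline-constant path of selectG_CONSTANT above, the 64-bit immediate is split into two 32-bit halves and reassembled with REG_SEQUENCE; a minimal sketch of that split (illustrative value, same APInt::trunc and APInt::ashr calls) before getAddrModeInfo continues below:

#include "llvm/ADT/APInt.h"
#include <cassert>
#include <cstdint>

static void splitImm64Example() {
  llvm::APInt Imm(64, 0x0000123480000000ULL);
  uint64_t Lo = Imm.trunc(32).getZExtValue(); // low half -> LoReg: 0x80000000
  uint64_t Hi = Imm.ashr(32).getZExtValue();  // high half -> HiReg: 0x00001234
  assert(Lo == 0x80000000ULL && Hi == 0x1234ULL);
}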
assert(GEPInfo.Imm == 0); GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); continue; @@ -1240,16 +1535,26 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { return false; } -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { - // TODO: Can/should we insert m0 initialization here for DS instructions and - // call the normal selector? - return false; +void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + + const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); + unsigned AS = PtrTy.getAddressSpace(); + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && + STI.ldsRequiresM0Init()) { + // If DS instructions require M0 initializtion, insert it before selecting. + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addImm(-1); + } +} + +bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { + initM0(I); + return selectImpl(I, *CoverageInfo); } bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &CondOp = I.getOperand(0); Register CondReg = CondOp.getReg(); const DebugLoc &DL = I.getDebugLoc(); @@ -1263,11 +1568,12 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { // GlobalISel, we should push that decision into RegBankSelect. Assume for now // RegBankSelect knows what it's doing if the branch condition is scc, even // though it currently does not. - if (isSCC(CondReg, MRI)) { + if (isSCC(CondReg, *MRI)) { CondPhysReg = AMDGPU::SCC; BrOpcode = AMDGPU::S_CBRANCH_SCC1; - ConstrainRC = &AMDGPU::SReg_32_XM0RegClass; - } else if (isVCC(CondReg, MRI)) { + // FIXME: Hack for isSCC tests + ConstrainRC = &AMDGPU::SGPR_32RegClass; + } else if (isVCC(CondReg, *MRI)) { // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? // We sort of know that a VCC producer based on the register bank, that ands // inactive lanes with 0. What if there was a logical operation with vcc @@ -1279,8 +1585,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { } else return false; - if (!MRI.getRegClassOrNull(CondReg)) - MRI.setRegClass(CondReg, ConstrainRC); + if (!MRI->getRegClassOrNull(CondReg)) + MRI->setRegClass(CondReg, ConstrainRC); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) .addReg(CondReg); @@ -1292,27 +1598,83 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { } bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register DstReg = I.getOperand(0).getReg(); - const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); if (IsVGPR) I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); return RBI.constrainGenericRegister( - DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI); + DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); +} + +bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { + uint64_t Align = I.getOperand(2).getImm(); + const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); + + MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); + const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; + unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + const TargetRegisterClass &RegRC + = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + + LLT Ty = MRI->getType(DstReg); + + const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, + *MRI); + const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, + *MRI); + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + Register ImmReg = MRI->createVirtualRegister(&RegRC); + BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) + .addImm(Mask); + + if (Ty.getSizeInBits() == 32) { + BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) + .addReg(SrcReg) + .addReg(ImmReg); + I.eraseFromParent(); + return true; + } + + Register HiReg = MRI->createVirtualRegister(&RegRC); + Register LoReg = MRI->createVirtualRegister(&RegRC); + Register MaskLo = MRI->createVirtualRegister(&RegRC); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(SrcReg, 0, AMDGPU::sub0); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(SrcReg, 0, AMDGPU::sub1); + + BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) + .addReg(LoReg) + .addReg(ImmReg); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(MaskLo) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + return true; } -bool AMDGPUInstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); - if (!isPreISelGenericOpcode(I.getOpcode())) { + if (!I.isPreISelOpcode()) { if (I.isCopy()) return selectCOPY(I); return true; @@ -1324,16 +1686,18 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_XOR: if (selectG_AND_OR_XOR(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: - if (selectG_ADD_SUB(I)) + if (selectImpl(I, *CoverageInfo)) return true; - LLVM_FALLTHROUGH; - default: - return selectImpl(I, CoverageInfo); + return selectG_ADD_SUB(I); + case TargetOpcode::G_UADDO: + case TargetOpcode::G_USUBO: + return selectG_UADDO_USUBO(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: + case TargetOpcode::G_PTRTOINT: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: @@ -1353,32 +1717,40 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, case TargetOpcode::G_INSERT: return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: - return selectG_INTRINSIC(I, CoverageInfo); + return selectG_INTRINSIC(I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: - return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); + return selectG_INTRINSIC_W_SIDE_EFFECTS(I); case 
TargetOpcode::G_ICMP: if (selectG_ICMP(I)) return true; - return selectImpl(I, CoverageInfo); + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_LOAD: - return selectImpl(I, CoverageInfo); + case TargetOpcode::G_ATOMIC_CMPXCHG: + case TargetOpcode::G_ATOMICRMW_XCHG: + case TargetOpcode::G_ATOMICRMW_ADD: + case TargetOpcode::G_ATOMICRMW_SUB: + case TargetOpcode::G_ATOMICRMW_AND: + case TargetOpcode::G_ATOMICRMW_OR: + case TargetOpcode::G_ATOMICRMW_XOR: + case TargetOpcode::G_ATOMICRMW_MIN: + case TargetOpcode::G_ATOMICRMW_MAX: + case TargetOpcode::G_ATOMICRMW_UMIN: + case TargetOpcode::G_ATOMICRMW_UMAX: + case TargetOpcode::G_ATOMICRMW_FADD: + return selectG_LOAD_ATOMICRMW(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_STORE: - if (selectImpl(I, CoverageInfo)) - return true; return selectG_STORE(I); case TargetOpcode::G_TRUNC: return selectG_TRUNC(I); case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: - if (selectG_SZA_EXT(I)) { - I.eraseFromParent(); - return true; - } - - return false; + return selectG_SZA_EXT(I); + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + return selectG_SITOFP_UITOFP(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: @@ -1388,6 +1760,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, // is checking for G_CONSTANT I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE)); return true; + case TargetOpcode::G_PTR_MASK: + return selectG_PTR_MASK(I); + default: + return selectImpl(I, *CoverageInfo); } return false; } @@ -1402,14 +1778,14 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src) const { unsigned Mods = 0; - MachineInstr *MI = MRI.getVRegDef(Src); + MachineInstr *MI = MRI->getVRegDef(Src); if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; - MI = MRI.getVRegDef(Src); + MI = MRI->getVRegDef(Src); } if (MI && MI->getOpcode() == AMDGPU::G_FABS) { @@ -1432,12 +1808,23 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1446,6 +1833,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod }}; } + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { return {{ @@ -1457,12 +1845,9 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 
InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { - MachineRegisterInfo &MRI - = Root.getParent()->getParent()->getParent()->getRegInfo(); - Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1471,12 +1856,28 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); +AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { + // FIXME: Handle clamp and op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { + // FIXME: Handle op_sel + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1496,11 +1897,8 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { - MachineRegisterInfo &MRI = - Root.getParent()->getParent()->getParent()->getRegInfo(); - SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) return None; @@ -1521,10 +1919,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); SmallVector<GEPInfo, 4> AddrInfo; - getAddrModeInfo(*MI, MRI, AddrInfo); + getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, // then we can select all ptr + 32-bit offsets not just immediate offsets. @@ -1540,7 +1937,7 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { // failed trying to select this load into one of the _IMM variants since // the _IMM Patterns are considered before the _SGPR patterns. 
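Each of these selectVOP3* hooks returns ComplexRendererFns, an ordered list of callbacks that the TableGen-erated selector invokes to append the operands a complex pattern produces. A minimal sketch of the shape (hypothetical helper name, assuming a matching declaration in AMDGPUInstructionSelector.h) before the SMRD selection continues below:

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectPlainSrc(MachineOperand &Root) const {
  // One renderer per produced operand, emitted in operand order: the source
  // register, then a cleared source-modifier immediate.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
  }};
}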
unsigned PtrReg = GEPInfo.SgprParts[0]; - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); return {{ @@ -1553,8 +1950,6 @@ template <bool Signed> InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); InstructionSelector::ComplexRendererFns Default = {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, @@ -1565,12 +1960,12 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { if (!STI.hasFlatInstOffsets()) return Default; - const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg()); + const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg()); if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP) return Default; Optional<int64_t> Offset = - getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI); + getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI); if (!Offset.hasValue()) return Default; @@ -1597,12 +1992,6 @@ AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { return selectFlatOffsetImpl<true>(Root); } -// FIXME: Implement -static bool signBitIsZero(const MachineOperand &Op, - const MachineRegisterInfo &MRI) { - return false; -} - static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); return PSV && PSV->isStack(); @@ -1613,12 +2002,11 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); int64_t Offset = 0; - if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) { - Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { + Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); // TODO: Should this be inside the render function? The iterator seems to // move. @@ -1652,18 +2040,18 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { // offsets. 
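The mi_match calls in these scratch-offset selectors come from the GlobalISel MIPatternMatch helpers; a minimal sketch of the idiom as a free function (hypothetical helper, essentially what the in-tree getConstantVRegVal used elsewhere in this patch does):

#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"

// Return the constant a virtual register is defined by, if any; callers such
// as the selectors above then check legality (e.g. isLegalMUBUFImmOffset).
static llvm::Optional<int64_t>
matchConstantReg(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI) {
  int64_t Cst = 0;
  if (llvm::MIPatternMatch::mi_match(Reg, MRI,
                                     llvm::MIPatternMatch::m_ICst(Cst)))
    return Cst;
  return llvm::None;
}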
Optional<int> FI; Register VAddr = Root.getReg(); - if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) { - if (isBaseWithConstantOffset(Root, MRI)) { + if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { + if (isBaseWithConstantOffset(Root, *MRI)) { const MachineOperand &LHS = RootDef->getOperand(1); const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); if (LHSDef && RHSDef) { int64_t PossibleOffset = RHSDef->getOperand(1).getCImm()->getSExtValue(); if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) && (!STI.privateMemoryResourceIsRangeChecked() || - signBitIsZero(LHS, MRI))) { + KnownBits->signBitIsZero(LHS.getReg()))) { if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX) FI = LHSDef->getOperand(1).getIndex(); else @@ -1700,15 +2088,30 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }}}; } +bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return KnownBits->signBitIsZero(Base.getReg()); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int64_t Offset = 0; - if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) || + if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; @@ -1728,3 +2131,54 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { + const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); + if (!RootDef) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; + } + + int64_t ConstAddr = 0; + if (isBaseWithConstantOffset(Root, *MRI)) { + const MachineOperand &LHS = RootDef->getOperand(1); + const MachineOperand &RHS = RootDef->getOperand(2); + const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); + const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t PossibleOffset = + RHSDef->getOperand(1).getCImm()->getSExtValue(); + if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) { + // (add n0, c0) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); } + }}; + } + } + } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { + + + + } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + + + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + }}; +} + +void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + 
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); + assert(CstVal && "Expected constant value"); + MIB.addImm(CstVal.getValue()); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 4f489ddfb23d..d3c83a6a872a 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,6 +35,7 @@ class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class GCNSubtarget; class MachineInstr; +class MachineIRBuilder; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; @@ -42,14 +43,20 @@ class SIMachineFunctionInfo; class SIRegisterInfo; class AMDGPUInstructionSelector : public InstructionSelector { +private: + MachineRegisterInfo *MRI; + public: AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName(); + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override; + private: struct GEPInfo { const MachineInstr &GEP; @@ -72,32 +79,42 @@ private: bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; + bool selectG_SITOFP_UITOFP(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; + bool selectG_UADDO_USUBO(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; - bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; - bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const; + bool selectG_INTRINSIC(MachineInstr &I) const; + + std::tuple<Register, unsigned, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + + bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; + + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; bool selectG_ICMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const; - bool selectG_LOAD(MachineInstr &I) const; - bool selectG_SELECT(MachineInstr &I) const; + + void initM0(MachineInstr &I) const; + bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_FRAME_INDEX(MachineInstr &I) const; + bool selectG_PTR_MASK(MachineInstr &I) const; std::pair<Register, unsigned> - selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3ModsImpl(Register Src) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -108,11 +125,18 @@ private: 
InstructionSelector::ComplexRendererFns selectVOP3Mods0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OpSelMods(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSmrdImm32(MachineOperand &Root) const; @@ -133,6 +157,16 @@ private: InstructionSelector::ComplexRendererFns selectMUBUFScratchOffset(MachineOperand &Root) const; + bool isDSOffsetLegal(const MachineRegisterInfo &MRI, + const MachineOperand &Base, + int64_t Offset, unsigned OffsetBits) const; + + InstructionSelector::ComplexRendererFns + selectDS1Addr1Offset(MachineOperand &Root) const; + + void renderTruncImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 61bc415c839d..846e7f577a28 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -75,7 +75,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> let isCodeGenOnly = 1; } -def TruePredicate : Predicate<"true">; +def TruePredicate : Predicate<"">; class PredicateControl { Predicate SubtargetPredicate = TruePredicate; @@ -220,80 +220,48 @@ def hi_f16_elt : PatLeaf< // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; +def COND_OEQ : PatFrags<(ops), [(OtherVT SETOEQ), (OtherVT SETEQ)]>; +def COND_ONE : PatFrags<(ops), [(OtherVT SETONE), (OtherVT SETNE)]>; +def COND_OGT : PatFrags<(ops), [(OtherVT SETOGT), (OtherVT SETGT)]>; +def COND_OGE : PatFrags<(ops), [(OtherVT SETOGE), (OtherVT SETGE)]>; +def COND_OLT : PatFrags<(ops), [(OtherVT SETOLT), (OtherVT SETLT)]>; +def COND_OLE : PatFrags<(ops), [(OtherVT SETOLE), (OtherVT SETLE)]>; +def COND_O : PatFrags<(ops), [(OtherVT SETO)]>; +def COND_UO : PatFrags<(ops), [(OtherVT SETUO)]>; //===----------------------------------------------------------------------===// // PatLeafs for unsigned / unordered comparisons //===----------------------------------------------------------------------===// -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : 
PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; +def COND_UEQ : PatFrag<(ops), (OtherVT SETUEQ)>; +def COND_UNE : PatFrag<(ops), (OtherVT SETUNE)>; +def COND_UGT : PatFrag<(ops), (OtherVT SETUGT)>; +def COND_UGE : PatFrag<(ops), (OtherVT SETUGE)>; +def COND_ULT : PatFrag<(ops), (OtherVT SETULT)>; +def COND_ULE : PatFrag<(ops), (OtherVT SETULE)>; // XXX - For some reason R600 version is preferring to use unordered // for setne? -def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; +def COND_UNE_NE : PatFrags<(ops), [(OtherVT SETUNE), (OtherVT SETNE)]>; //===----------------------------------------------------------------------===// // PatLeafs for signed comparisons //===----------------------------------------------------------------------===// -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; +def COND_SGT : PatFrag<(ops), (OtherVT SETGT)>; +def COND_SGE : PatFrag<(ops), (OtherVT SETGE)>; +def COND_SLT : PatFrag<(ops), (OtherVT SETLT)>; +def COND_SLE : PatFrag<(ops), (OtherVT SETLE)>; //===----------------------------------------------------------------------===// // PatLeafs for integer equality //===----------------------------------------------------------------------===// -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; +def COND_EQ : PatFrags<(ops), [(OtherVT SETEQ), (OtherVT SETUEQ)]>; +def COND_NE : PatFrags<(ops), [(OtherVT SETNE), (OtherVT SETUNE)]>; +// FIXME: Should not need code predicate +//def COND_NULL : PatLeaf<(OtherVT null_frag)>; def COND_NULL : PatLeaf < (cond), [{(void)N; return false;}] @@ -335,17 +303,17 @@ def TEX_SHADOW_ARRAY : PatLeaf< // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// +def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + class AddressSpaceList<list<int> AS> { list<int> AddrSpaces = AS; } -class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAlignment() % 8 == 0; -}]>; - -class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAlignment() >= 16; -}]>; +class Aligned<int Bytes> { + int MinAlignment = Bytes; +} class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>; @@ -502,6 +470,35 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>; } // End foreach AddrSpace +multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { + let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + defm "_"#as : binary_atomic_op<atomic_op, IsInt>; + + let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in { + defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>; + } + + let PredicateCode = [{return !(SDValue(N, 
0).use_empty());}] in { + defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>; + } + } + } +} + +defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>; +defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>; +defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>; +defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>; +defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>; +defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>; +defm atomic_load_sub : ret_noret_binary_atomic_op<atomic_load_sub>; +defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>; +defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>; +defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>; +defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>; + + def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress; def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress; @@ -513,21 +510,31 @@ def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; def atomic_store_local : LocalStore <atomic_store>; -def load_align8_local : Aligned8Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; -def load_align16_local : Aligned16Bytes < - (ops node:$ptr), (load_local node:$ptr) ->; +def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} -def store_align8_local : Aligned8Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; +def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +def store_align8_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<8> { + let IsStore = 1; + let IsTruncStore = 0; +} + +def store_align16_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<16> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_align16_local : Aligned16Bytes < - (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) ->; def atomic_store_flat : FlatStore <atomic_store>; def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress; @@ -547,69 +554,26 @@ class region_binary_atomic_op<SDNode atomic_op> : }]>; -def atomic_swap_local : local_binary_atomic_op<atomic_swap>; -def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; -def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; -def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>; -def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>; -def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>; -def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>; -def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>; -def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>; -def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>; -def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; - def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; -class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = 
cast<AtomicSDNode>(N); - return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; +let AddressSpaces = StoreAddress_local.AddrSpaces in { +defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +} -def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; +let AddressSpaces = StoreAddress_region.AddrSpaces in { +defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +} class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; -multiclass global_binary_atomic_op<SDNode atomic_op> { - def "" : global_binary_atomic_op_frag<atomic_op>; - - def _noret : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; - - def _ret : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; -} - -defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; -defm atomic_add_global : global_binary_atomic_op<atomic_load_add>; -defm atomic_and_global : global_binary_atomic_op<atomic_load_and>; -defm atomic_max_global : global_binary_atomic_op<atomic_load_max>; -defm atomic_min_global : global_binary_atomic_op<atomic_load_min>; -defm atomic_or_global : global_binary_atomic_op<atomic_load_or>; -defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; -defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; -defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; -defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; - // Legacy. def AMDGPUatomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$value), diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 670f6225fbf7..5aba35a19ced 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -11,6 +11,13 @@ /// \todo This should be generated by TableGen. 
//===----------------------------------------------------------------------===// +#if defined(_MSC_VER) || defined(__MINGW32__) +// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI +// from the Visual C++ cmath / math.h headers: +// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 +#define _USE_MATH_DEFINES +#endif + #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUTargetMachine.h" @@ -20,6 +27,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -32,7 +40,7 @@ using namespace LegalityPredicates; static LegalityPredicate isMultiple32(unsigned TypeIdx, - unsigned MaxSize = 512) { + unsigned MaxSize = 1024) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; const LLT EltTy = Ty.getScalarType(); @@ -40,12 +48,27 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx, }; } +static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getSizeInBits() == Size; + }; +} + static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; return Ty.isVector() && Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32; + Ty.getElementType().getSizeInBits() < 32 && + Ty.getSizeInBits() % 32 != 0; + }; +} + +static LegalityPredicate isWideVec16(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const LLT EltTy = Ty.getScalarType(); + return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; }; } @@ -68,6 +91,31 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { }; } +// Increase the number of vector elements to reach the next multiple of 32-bit +// type. +static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + + const LLT EltTy = Ty.getElementType(); + const int Size = Ty.getSizeInBits(); + const int EltSize = EltTy.getSizeInBits(); + const int NextMul32 = (Size + 31) / 32; + + assert(EltSize < 32); + + const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; + return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); + }; +} + +static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + const LLT QueryTy = Query.Types[TypeIdx]; + return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; + }; +} + static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -82,7 +130,7 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of +// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of // v2s16. 
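These predicate factories all share one shape: capture a type index and return a LegalityPredicate lambda over the LegalityQuery. A small hypothetical example of that shape (not one of the predicates in this patch, and assuming the surrounding file's headers and using-declarations); the isRegisterType predicate that the comment above introduces then follows:

// Matches any scalar whose size in bits is a power of two (illustrative only).
static LegalityPredicate isPow2Scalar(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && isPowerOf2_32(Ty.getSizeInBits());
  };
}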
static LegalityPredicate isRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -94,7 +142,21 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) { EltSize == 128 || EltSize == 256; } - return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512; + return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; + }; +} + +static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getElementType() == Type; + }; +} + +static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return !Ty.isVector() && Ty.getSizeInBits() > 32 && + Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); }; } @@ -112,9 +174,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); + const LLT S96 = LLT::scalar(96); const LLT S128 = LLT::scalar(128); const LLT S256 = LLT::scalar(256); - const LLT S512 = LLT::scalar(512); + const LLT S1024 = LLT::scalar(1024); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -134,6 +197,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V14S32 = LLT::vector(14, 32); const LLT V15S32 = LLT::vector(15, 32); const LLT V16S32 = LLT::vector(16, 32); + const LLT V32S32 = LLT::vector(32, 32); const LLT V2S64 = LLT::vector(2, 64); const LLT V3S64 = LLT::vector(3, 64); @@ -142,16 +206,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT V6S64 = LLT::vector(6, 64); const LLT V7S64 = LLT::vector(7, 64); const LLT V8S64 = LLT::vector(8, 64); + const LLT V16S64 = LLT::vector(16, 64); std::initializer_list<LLT> AllS32Vectors = {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, - V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32}; + V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; std::initializer_list<LLT> AllS64Vectors = - {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64}; + {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); + const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); + const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); @@ -162,7 +229,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const std::initializer_list<LLT> AddrSpaces32 = { - LocalPtr, PrivatePtr + LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr }; const std::initializer_list<LLT> FPTypesBase = { @@ -216,37 +283,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); - getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, + getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}}) - .clampScalar(0, S32, S32); + 
.clampScalar(0, S32, S32) + .scalarize(0); // TODO: Implement. + + getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) + .lower(); getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. - .legalIf(all(isPointer(0), isPointer(1))); + .legalIf(all(isRegisterType(0), isRegisterType(1))) + // FIXME: Testing hack + .legalForCartesianProduct({S16, LLT::vector(2, 8), }); - if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64, S16}) - .clampScalar(0, S16, S64); - } else { - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64}) - .clampScalar(0, S32, S64); - } + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr, + .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .clampScalarOrElt(0, S32, S512) + .clampScalarOrElt(0, S32, S1024) .legalIf(isMultiple32(0)) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16); @@ -256,23 +320,33 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // values may not be legal. We need to figure out how to distinguish // between these two scenarios. getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({S1, S32, S64, GlobalPtr, + .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0) .legalIf(isPointer(0)); setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); + auto &FPOpActions = getActionDefinitionsBuilder( - { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE}) + { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) .legalFor({S32, S64}); + auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) + .customFor({S32, S64}); + auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) + .customFor({S32, S64}); if (ST.has16BitInsts()) { if (ST.hasVOP3PInsts()) FPOpActions.legalFor({S16, V2S16}); else FPOpActions.legalFor({S16}); + + TrigActions.customFor({S16}); + FDIVActions.customFor({S16}); } auto &MinNumMaxNum = getActionDefinitionsBuilder({ @@ -293,22 +367,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - if (ST.hasVOP3PInsts()) FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions .scalarize(0) .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + TrigActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); + + FDIVActions + .scalarize(0) + .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); + + getActionDefinitionsBuilder({G_FNEG, G_FABS}) + .legalFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .clampScalar(0, S16, S64); + + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + if (ST.has16BitInsts()) { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); } else { - getActionDefinitionsBuilder(G_FSQRT) + getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64}) .scalarize(0) .clampScalar(0, S32, S64); @@ -334,23 +423,43 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S32, S64); + // Whether this is legal depends on the floating point mode for the function. + auto &FMad = getActionDefinitionsBuilder(G_FMAD); + if (ST.hasMadF16()) + FMad.customFor({S32, S16}); + else + FMad.customFor({S32}); + FMad.scalarize(0) + .lower(); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, {S32, S1}, {S64, S1}, {S16, S1}, + {S96, S32}, // FIXME: Hack {S64, LLT::scalar(33)}, {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) .scalarize(0); - getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalFor({{S32, S32}, {S64, S32}}) + // TODO: Split s1->s64 during regbankselect for VALU. + auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) .lowerFor({{S32, S64}}) - .customFor({{S64, S64}}) - .scalarize(0); + .customFor({{S64, S64}}); + if (ST.has16BitInsts()) + IToFP.legalFor({{S16, S16}}); + IToFP.clampScalar(1, S32, S64) + .scalarize(0); + + auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); + if (ST.has16BitInsts()) + FPToI.legalFor({{S16, S16}}); + else + FPToI.minScalar(1, S32); - getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalFor({{S32, S32}, {S32, S64}}) - .scalarize(0); + FPToI.minScalar(0, S32) + .scalarize(0); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) .legalFor({S32, S64}) @@ -374,6 +483,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalForCartesianProduct(AddrSpaces32, {S32}) .scalarize(0); + getActionDefinitionsBuilder(G_PTR_MASK) + .scalarize(0) + .alwaysLegal(); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &CmpBuilder = @@ -415,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(1, 32); // TODO: Expand for > s32 - getActionDefinitionsBuilder(G_BSWAP) + getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); @@ -491,87 +604,239 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }); - if (ST.hasFlatAddressSpace()) { - getActionDefinitionsBuilder(G_ADDRSPACE_CAST) - .scalarize(0) - .custom(); - } + getActionDefinitionsBuilder(G_ADDRSPACE_CAST) + .scalarize(0) + .custom(); // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
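As a reading aid for the rule chains in this constructor: clauses are tried in the order written, and each one either declares a set of types legal or describes how to mutate the type at the given index until a legal form is reached. A small hypothetical chain showing the common verbs (a fragment that belongs inside the constructor, reusing the S32 shorthand above; not a rule added by this patch). The G_LOAD and G_STORE rules that the comment above introduces continue below.

getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}})        // exact (result, source) pairs legal as-is
    .clampScalar(0, S32, S32)      // otherwise force the result type to s32
    .widenScalarToNextPow2(1, 32)  // round the source up to a power of 2, >= 32
    .scalarize(0);                 // finally, split leftover vectors elementwise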
- getActionDefinitionsBuilder({G_LOAD, G_STORE}) - .narrowScalarIf([](const LegalityQuery &Query) { - unsigned Size = Query.Types[0].getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (Size > 32 && MemSize < Size); - }, - [](const LegalityQuery &Query) { - return std::make_pair(0, LLT::scalar(32)); - }) - .fewerElementsIf([=](const LegalityQuery &Query) { - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - return (MemSize == 96) && - Query.Types[0].isVector() && - !ST.hasDwordx3LoadStores(); - }, - [=](const LegalityQuery &Query) { - return std::make_pair(0, V2S32); - }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &Ty0 = Query.Types[0]; - - unsigned Size = Ty0.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - if (Size < 32 || (Size > 32 && MemSize < Size)) - return false; - - if (Ty0.isVector() && Size != MemSize) - return false; - - // TODO: Decompose private loads into 4-byte components. - // TODO: Illegal flat loads on SI - switch (MemSize) { - case 8: - case 16: - return Size == 32; - case 32: - case 64: - case 128: - return true; + auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { + switch (AS) { + // FIXME: Private element size. + case AMDGPUAS::PRIVATE_ADDRESS: + return 32; + // FIXME: Check subtarget + case AMDGPUAS::LOCAL_ADDRESS: + return ST.useDS128() ? 128 : 64; + + // Treat constant and global as identical. SMRD loads are sometimes usable + // for global loads (ideally constant address space should be eliminated) + // depending on the context. Legality cannot be context dependent, but + // RegBankSelect can split the load as necessary depending on the pointer + // register bank/uniformity and if the memory is invariant or not written in + // a kernel. + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + return 512; + default: + return 128; + } + }; - case 96: - return ST.hasDwordx3LoadStores(); - - case 256: - case 512: - // TODO: Possibly support loads of i256 and i512 . This will require - // adding i256 and i512 types to MVT in order for to be able to use - // TableGen. - // TODO: Add support for other vector types, this will require - // defining more value mappings for the new types. - return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 || - Ty0.getScalarType().getSizeInBits() == 64); - - default: - return false; - } - }) - .clampScalar(0, S32, S64); + const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { + const LLT DstTy = Query.Types[0]; + + // Split vector extloads. + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) + return true; + + const LLT PtrTy = Query.Types[1]; + unsigned AS = PtrTy.getAddressSpace(); + if (MemSize > maxSizeForAddrSpace(AS)) + return true; + + // Catch weird sized loads that don't evenly divide into the access sizes + // TODO: May be able to widen depending on alignment etc. + unsigned NumRegs = MemSize / 32; + if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) + return true; + + unsigned Align = Query.MMODescrs[0].AlignInBits; + if (Align < MemSize) { + const SITargetLowering *TLI = ST.getTargetLowering(); + return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); + } + + return false; + }; + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; + unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; + unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 
0 : 8; + + // TODO: Refine based on subtargets which support unaligned access or 128-bit + // LDS + // TODO: Unsupported flat for SI. + + for (unsigned Op : {G_LOAD, G_STORE}) { + const bool IsStore = Op == G_STORE; + + auto &Actions = getActionDefinitionsBuilder(Op); + // Whitelist the common cases. + // TODO: Pointer loads + // TODO: Wide constant loads + // TODO: Only CI+ has 3x loads + // TODO: Loads to s16 on gfx9 + Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, + {V2S32, GlobalPtr, 64, GlobalAlign32}, + {V3S32, GlobalPtr, 96, GlobalAlign32}, + {S96, GlobalPtr, 96, GlobalAlign32}, + {V4S32, GlobalPtr, 128, GlobalAlign32}, + {S128, GlobalPtr, 128, GlobalAlign32}, + {S64, GlobalPtr, 64, GlobalAlign32}, + {V2S64, GlobalPtr, 128, GlobalAlign32}, + {V2S16, GlobalPtr, 32, GlobalAlign32}, + {S32, GlobalPtr, 8, GlobalAlign8}, + {S32, GlobalPtr, 16, GlobalAlign16}, + + {S32, LocalPtr, 32, 32}, + {S64, LocalPtr, 64, 32}, + {V2S32, LocalPtr, 64, 32}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {V2S16, LocalPtr, 32, 32}, + + {S32, PrivatePtr, 32, 32}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {V2S16, PrivatePtr, 32, 32}, + + {S32, FlatPtr, 32, GlobalAlign32}, + {S32, FlatPtr, 16, GlobalAlign16}, + {S32, FlatPtr, 8, GlobalAlign8}, + {V2S16, FlatPtr, 32, GlobalAlign32}, + + {S32, ConstantPtr, 32, GlobalAlign32}, + {V2S32, ConstantPtr, 64, GlobalAlign32}, + {V3S32, ConstantPtr, 96, GlobalAlign32}, + {V4S32, ConstantPtr, 128, GlobalAlign32}, + {S64, ConstantPtr, 64, GlobalAlign32}, + {S128, ConstantPtr, 128, GlobalAlign32}, + {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions + .customIf(typeIs(1, Constant32Ptr)) + .narrowScalarIf( + [=](const LegalityQuery &Query) -> bool { + return !Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + const unsigned DstSize = DstTy.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + + // Split extloads. + if (DstSize > MemSize) + return std::make_pair(0, LLT::scalar(MemSize)); + + if (DstSize > 32 && (DstSize % 32 != 0)) { + // FIXME: Need a way to specify non-extload of larger size if + // suitably aligned. + return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); + } + + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + if (MemSize > MaxSize) + return std::make_pair(0, LLT::scalar(MaxSize)); + + unsigned Align = Query.MMODescrs[0].AlignInBits; + return std::make_pair(0, LLT::scalar(Align)); + }) + .fewerElementsIf( + [=](const LegalityQuery &Query) -> bool { + return Query.Types[0].isVector() && needToSplitLoad(Query); + }, + [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { + const LLT DstTy = Query.Types[0]; + const LLT PtrTy = Query.Types[1]; + + LLT EltTy = DstTy.getElementType(); + unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + + // Split if it's too large for the address space. + if (Query.MMODescrs[0].SizeInBits > MaxSize) { + unsigned NumElts = DstTy.getNumElements(); + unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; + + // FIXME: Refine when odd breakdowns handled + // The scalars will need to be re-legalized. + if (NumPieces == 1 || NumPieces >= NumElts || + NumElts % NumPieces != 0) + return std::make_pair(0, EltTy); + + return std::make_pair(0, + LLT::vector(NumElts / NumPieces, EltTy)); + } + + // Need to split because of alignment. 
+ unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned EltSize = EltTy.getSizeInBits(); + if (EltSize > Align && + (EltSize / Align < DstTy.getNumElements())) { + return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); + } + + // May need relegalization for the scalars. + return std::make_pair(0, EltTy); + }) + .minScalar(0, S32); + + if (IsStore) + Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); + + // TODO: Need a bitcast lower option? + Actions + .legalIf([=](const LegalityQuery &Query) { + const LLT Ty0 = Query.Types[0]; + unsigned Size = Ty0.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + + // No extending vector loads. + if (Size > MemSize && Ty0.isVector()) + return false; + + // FIXME: Widening store from alignment not valid. + if (MemSize < Size) + MemSize = std::max(MemSize, Align); + + switch (MemSize) { + case 8: + case 16: + return Size == 32; + case 32: + case 64: + case 128: + return true; + case 96: + return ST.hasDwordx3LoadStores(); + case 256: + case 512: + return true; + default: + return false; + } + }) + .widenScalarToNextPow2(0) + // TODO: v3s32->v4s32 with alignment + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); + } - // FIXME: Handle alignment requirements. auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .legalForTypesWithMemDesc({ - {S32, GlobalPtr, 8, 8}, - {S32, GlobalPtr, 16, 8}, - {S32, LocalPtr, 8, 8}, - {S32, LocalPtr, 16, 8}, - {S32, PrivatePtr, 8, 8}, - {S32, PrivatePtr, 16, 8}}); + .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, + {S32, GlobalPtr, 16, 2 * 8}, + {S32, LocalPtr, 8, 8}, + {S32, LocalPtr, 16, 16}, + {S32, PrivatePtr, 8, 8}, + {S32, PrivatePtr, 16, 16}, + {S32, ConstantPtr, 8, 8}, + {S32, ConstantPtr, 16, 2 * 8}}); if (ST.hasFlatAddressSpace()) { - ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8}, - {S32, FlatPtr, 16, 8}}); + ExtLoads.legalForTypesWithMemDesc( + {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); } ExtLoads.clampScalar(0, S32, S32) @@ -590,6 +855,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + .legalFor({{S32, LocalPtr}}); + + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lower(); + // TODO: Pointer types, any 32-bit or 64-bit vector getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, @@ -643,7 +914,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return (EltTy.getSizeInBits() == 16 || EltTy.getSizeInBits() % 32 == 0) && VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 512 && + VecTy.getSizeInBits() <= 1024 && IdxTy.getSizeInBits() == 32; }) .clampScalar(EltTypeIdx, S32, S64) @@ -663,6 +934,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) + .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + // FIXME: Multiples of 16 should not be legal. 
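For vector loads, the fewerElementsIf mutation above picks how many elements each piece keeps, either to fit the address-space limit or because the access is under-aligned. A compact model of that choice using the same piece-count arithmetic; the helper name and the sample numbers are illustrative only.

#include <cstdio>
#include <utility>

static std::pair<unsigned, const char *>
pickVectorBreakdown(unsigned NumElts, unsigned EltSize, unsigned MemSize,
                    unsigned MaxSize, unsigned AlignBits) {
  // Too wide for the address space: cut into MemSize / MaxSize pieces.
  if (MemSize > MaxSize) {
    unsigned NumPieces = MemSize / MaxSize;
    if (NumPieces == 1 || NumPieces >= NumElts || NumElts % NumPieces != 0)
      return {1, "scalarize, odd breakdown"};
    return {NumElts / NumPieces, "split to fit the address space"};
  }
  // Under-aligned access: mirror the rule above (EltSize / AlignBits elements).
  if (EltSize > AlignBits && EltSize / AlignBits < NumElts)
    return {EltSize / AlignBits, "split because of alignment"};
  return {1, "scalarize and re-legalize"};
}

int main() {
  // e.g. a <16 x s32> (512-bit) load from LDS with a 64-bit limit:
  auto R = pickVectorBreakdown(16, 32, 512, 64, 32);
  printf("%u elements per piece (%s)\n", R.first, R.second); // 2 elements
}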
.legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; const LLT LitTy = Query.Types[LitTyIdx]; @@ -686,18 +959,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } - getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalForCartesianProduct(AllS32Vectors, {S32}) - .legalForCartesianProduct(AllS64Vectors, {S64}) - .clampNumElements(0, V16S32, V16S32) - .clampNumElements(0, V2S64, V8S64) - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); + auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V32S32) + .clampNumElements(0, V2S64, V16S64) + .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); + + if (ST.hasScalarPackInsts()) + BuildVector.legalFor({V2S16, S32}); + + BuildVector + .minScalarSameAs(1, 0) + .legalIf(isRegisterType(0)) + .minScalarOrElt(0, S32); + + if (ST.hasScalarPackInsts()) { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .legalFor({V2S16, S32}) + .lower(); + } else { + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .lower(); + } getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalIf(isRegisterType(0)); + // TODO: Don't fully scalarize v2s16 pieces + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); + // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; @@ -715,14 +1006,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; - getActionDefinitionsBuilder(Op) + auto &Builder = getActionDefinitionsBuilder(Op) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) // Clamp the little scalar to s8-s256 and make it a power of 2. It's not // worth considering the multiples of 64 since 2*192 and 2*384 are not // valid. .clampScalar(LitTyIdx, S16, S256) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) - + .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) + .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), + elementTypeIs(1, S16)), + changeTo(1, V2S16)) // Break up vectors with weird elements into scalars .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, @@ -730,25 +1024,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .fewerElementsIf( [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, scalarize(1)) - .clampScalar(BigTyIdx, S32, S512) - .widenScalarIf( + .clampScalar(BigTyIdx, S32, S1024) + .lowerFor({{S16, V2S16}}); + + if (Op == G_MERGE_VALUES) { + Builder.widenScalarIf( + // TODO: Use 16-bit shifts if legal for 8-bit values? [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 16 != 0; + const LLT Ty = Query.Types[LitTyIdx]; + return Ty.getSizeInBits() < 32; }, - [=](const LegalityQuery &Query) { - // Pick the next power of 2, or a multiple of 64 over 128. - // Whichever is smaller. 
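G_BUILD_VECTOR_TRUNC is kept legal for {V2S16, S32} above only when the subtarget has scalar pack instructions such as s_pack_ll_b32. Assuming the usual low-half packing semantics, the operation amounts to the following bit manipulation; this is a sketch, not the LLVM implementation.

#include <cstdint>
#include <cstdio>

// Truncate each 32-bit source to 16 bits and pack them into one register,
// element 0 in the low half.
static uint32_t buildVectorTrunc16(uint32_t Lo, uint32_t Hi) {
  return (Lo & 0xffffu) | (Hi << 16);
}

int main() {
  printf("0x%08x\n", buildVectorTrunc16(0x12345678, 0xabcdef01)); // 0xef015678
}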
- const LLT &Ty = Query.Types[BigTyIdx]; - unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); - if (NewSizeInBits >= 256) { - unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); - if (RoundedTo < NewSizeInBits) - NewSizeInBits = RoundedTo; - } - return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); - }) + changeTo(LitTyIdx, S32)); + } + + Builder.widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. + const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; @@ -760,43 +1066,56 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return BigTy.getSizeInBits() % 16 == 0 && LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= 512; + BigTy.getSizeInBits() <= 1024; }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); } + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + computeTables(); verify(*ST.getInstrInfo()); } bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const { switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: - return legalizeAddrSpaceCast(MI, MRI, MIRBuilder); + return legalizeAddrSpaceCast(MI, MRI, B); case TargetOpcode::G_FRINT: - return legalizeFrint(MI, MRI, MIRBuilder); + return legalizeFrint(MI, MRI, B); case TargetOpcode::G_FCEIL: - return legalizeFceil(MI, MRI, MIRBuilder); + return legalizeFceil(MI, MRI, B); case TargetOpcode::G_INTRINSIC_TRUNC: - return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder); + return legalizeIntrinsicTrunc(MI, MRI, B); case TargetOpcode::G_SITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, true); + return legalizeITOFP(MI, MRI, B, true); case TargetOpcode::G_UITOFP: - return legalizeITOFP(MI, MRI, MIRBuilder, false); + return legalizeITOFP(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: - return legalizeMinNumMaxNum(MI, MRI, MIRBuilder); + return legalizeMinNumMaxNum(MI, MRI, B); case TargetOpcode::G_EXTRACT_VECTOR_ELT: - return legalizeExtractVectorElt(MI, MRI, MIRBuilder); + return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: - return legalizeInsertVectorElt(MI, MRI, MIRBuilder); + return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FCOS: + return legalizeSinCos(MI, MRI, B); + case TargetOpcode::G_GLOBAL_VALUE: + return legalizeGlobalValue(MI, MRI, B); + case TargetOpcode::G_LOAD: + return legalizeLoad(MI, MRI, B, Observer); + case TargetOpcode::G_FMAD: + return legalizeFMad(MI, MRI, B); + case TargetOpcode::G_FDIV: + return legalizeFDIV(MI, MRI, B); default: return false; } @@ -807,11 +1126,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, Register AMDGPULegalizerInfo::getSegmentAperture( unsigned 
AS, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const LLT S32 = LLT::scalar(32); + assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); + if (ST.hasApertureRegs()) { // FIXME: Use inline constants (src_{shared, private}_base) instead of // getreg. @@ -829,13 +1150,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register ApertureReg = MRI.createGenericVirtualRegister(S32); Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32) + B.buildInstr(AMDGPU::S_GETREG_B32) .addDef(GetReg) .addImm(Encoding); MRI.setType(GetReg, S32); - auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1); - MIRBuilder.buildInstr(TargetOpcode::G_SHL) + auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); + B.buildInstr(TargetOpcode::G_SHL) .addDef(ApertureReg) .addUse(GetReg) .addUse(ShiftAmt.getReg(0)); @@ -846,8 +1167,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - // FIXME: Placeholder until we can track the input registers. - MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) + return Register(); // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. @@ -870,18 +1192,19 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register LoadResult = MRI.createGenericVirtualRegister(S32); Register LoadAddr; - MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); - MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO); + B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.buildLoad(LoadResult, LoadAddr, *MMO); return LoadResult; } bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MachineFunction &MF = MIRBuilder.getMF(); + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); - MIRBuilder.setInstr(MI); + B.setInstr(MI); + const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -899,7 +1222,28 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { - MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST)); + MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); + return true; + } + + if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + // Truncate. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + uint32_t AddrHiVal = Info->get32BitAddressHighBits(); + + // FIXME: This is a bit ugly due to creating a merge of 2 pointers to + // another. Merge operands are required to be the same type, but creating an + // extra ptrtoint would be kind of pointless. 
+ auto HighAddr = B.buildConstant( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); + B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); + MI.eraseFromParent(); return true; } @@ -908,47 +1252,52 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( DestAS == AMDGPUAS::PRIVATE_ADDRESS); unsigned NullVal = TM.getNullPointerValue(DestAS); - auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal); - auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0); + auto SegmentNull = B.buildConstant(DstTy, NullVal); + auto FlatNull = B.buildConstant(SrcTy, 0); Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); // Extract low 32-bits of the pointer. - MIRBuilder.buildExtract(PtrLo32, Src, 0); + B.buildExtract(PtrLo32, Src, 0); Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); - MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); MI.eraseFromParent(); return true; } - assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::PRIVATE_ADDRESS); + if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) + return false; + + if (!ST.hasFlatAddressSpace()) + return false; auto SegmentNull = - MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); auto FlatNull = - MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder); + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); + B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT) + Register SrcAsInt = MRI.createGenericVirtualRegister(S32); + B.buildInstr(TargetOpcode::G_PTRTOINT) .addDef(SrcAsInt) .addUse(Src); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? 
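The merge built just below composes the flat pointer from the 32-bit segment offset and the aperture base, with the segment null value (all-ones here) mapped to the flat null pointer. On subtargets without aperture registers the base comes from the s_getreg read in getSegmentAperture above, shifted left by WidthM1 + 1 bits (16 here). A standalone sketch of that composition, with an invented aperture value.

#include <cstdint>
#include <cstdio>

static uint64_t castSegmentToFlat(uint32_t SegPtr, uint32_t SegmentNull,
                                  uint32_t ApertureHi32) {
  if (SegPtr == SegmentNull)      // null test done with a compare + select
    return 0;                     // flat null pointer
  return (uint64_t(ApertureHi32) << 32) | SegPtr; // {aperture, offset}
}

// The 16-bit value read with s_getreg becomes the high dword of the aperture.
static uint32_t apertureFromGetReg(uint32_t Hi16) { return Hi16 << 16; }

int main() {
  uint32_t Aperture = apertureFromGetReg(0x1600);   // hypothetical value
  printf("%#llx\n", (unsigned long long)castSegmentToFlat(
                        0x1234, ~0u, Aperture));    // 0x1600000000001234
}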
- MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); - MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); MI.eraseFromParent(); return true; @@ -956,8 +1305,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( bool AMDGPULegalizerInfo::legalizeFrint( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { - MIRBuilder.setInstr(MI); + MachineIRBuilder &B) const { + B.setInstr(MI); Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); @@ -966,18 +1315,18 @@ bool AMDGPULegalizerInfo::legalizeFrint( APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); - auto C1 = MIRBuilder.buildFConstant(Ty, C1Val); - auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src); + auto C1 = B.buildFConstant(Ty, C1Val); + auto CopySign = B.buildFCopysign(Ty, C1, Src); // TODO: Should this propagate fast-math-flags? - auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign); - auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign); + auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); + auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); - auto C2 = MIRBuilder.buildFConstant(Ty, C2Val); - auto Fabs = MIRBuilder.buildFAbs(Ty, Src); + auto C2 = B.buildFConstant(Ty, C2Val); + auto Fabs = B.buildFAbs(Ty, Src); - auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); - MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); + B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); return true; } @@ -1124,7 +1473,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); + HelperBuilder.setInstr(MI); return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; } @@ -1187,6 +1536,194 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +bool AMDGPULegalizerInfo::legalizeSinCos( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned Flags = MI.getFlags(); + + Register TrigVal; + auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); + if (ST.hasTrigReducedRange()) { + auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); + TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) + .addUse(MulVal.getReg(0)) + .setMIFlags(Flags).getReg(0); + } else + TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); + + Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
+ Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; + B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) + .addUse(TrigVal) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, + MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags) const { + // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered + // to the following code sequence: + // + // For constant address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // For global address space: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo + // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // fixups or relocations are emitted to replace $symbol@*@lo and + // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, + // which is a 64-bit pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. + + LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + + Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : + B.getMRI()->createGenericVirtualRegister(ConstPtrTy); + + MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) + .addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset + 4, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); + + B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); + + if (PtrTy.getSizeInBits() == 32) + B.buildExtract(DstReg, PCReg, 0); + return true; + } + +bool AMDGPULegalizerInfo::legalizeGlobalValue( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + unsigned AS = Ty.getAddressSpace(); + + const GlobalValue *GV = MI.getOperand(1).getGlobal(); + MachineFunction &MF = B.getMF(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + B.setInstr(MI); + + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { + if (!MFI->isEntryFunction()) { + const Function &Fn = MF.getFunction(); + DiagnosticInfoUnsupported BadLDSDecl( + Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); + Fn.getContext().diagnose(BadLDSDecl); + } + + // TODO: We could emit code to handle the initialization somewhere. 
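The comment block above explains why buildPCRelGlobalAddress adds 4 to the symbol offset. A small arithmetic sketch of the fixup, assuming the usual "symbol + addend - place" relocation formula and that the 32-bit literal sits 4 bytes after the address s_getpc_b64 returns.

#include <cstdint>
#include <cstdio>

static int64_t relocAddend(uint64_t GetPCResult, uint64_t GlobalAddr,
                           unsigned ExtraOffset) {
  uint64_t LiteralEncodingAddr = GetPCResult + 4;   // operand of s_add_u32
  // Value patched into the literal: symbol + addend - place.
  return int64_t(GlobalAddr + ExtraOffset + 4) - int64_t(LiteralEncodingAddr);
}

int main() {
  // With ExtraOffset = 0 the patched literal is exactly GlobalAddr - pc,
  // which is what the s_add_u32 / s_addc_u32 pair needs.
  printf("%lld\n", (long long)relocAddend(0x1000, 0x2000, 0)); // 4096
}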
+ if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { + B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); + MI.eraseFromParent(); + return true; + } + + const Function &Fn = MF.getFunction(); + DiagnosticInfoUnsupported BadInit( + Fn, "unsupported initializer for address space", MI.getDebugLoc()); + Fn.getContext().diagnose(BadInit); + return true; + } + + const SITargetLowering *TLI = ST.getTargetLowering(); + + if (TLI->shouldEmitFixup(GV)) { + buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); + MI.eraseFromParent(); + return true; + } + + if (TLI->shouldEmitPCReloc(GV)) { + buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); + MI.eraseFromParent(); + return true; + } + + LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); + + MachineMemOperand *GOTMMO = MF.getMachineMemOperand( + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 8 /*Size*/, 8 /*Align*/); + + buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); + + if (Ty.getSizeInBits() == 32) { + // Truncate if this is a 32-bit constant adrdess. + auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); + B.buildExtract(DstReg, Load, 0); + } else + B.buildLoad(DstReg, GOTAddr, *GOTMMO); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeLoad( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, GISelChangeObserver &Observer) const { + B.setInstr(MI); + LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Cast.getReg(0)); + Observer.changedInstr(MI); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFMad( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + assert(Ty.isScalar()); + + // TODO: Always legal with future ftz flag. + if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) + return true; + if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) + return true; + + MachineFunction &MF = B.getMF(); + + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); + HelperBuilder.setMBB(*MI.getParent()); + return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; +} + // Return the use branch instruction, otherwise null if the usage is invalid. 
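legalizeFMad above keeps G_FMAD only when denormal support is off for the type, since the mad/fmac instructions flush denormals, and otherwise hands it to LegalizerHelper::lowerFMad, which expands it into separate multiply and add. A sketch of that policy with a hypothetical helper.

#include <cstdio>

static bool keepFMadLegal(unsigned SizeInBits, bool HasFP32Denormals,
                          bool HasFP16Denormals) {
  if (SizeInBits == 32)
    return !HasFP32Denormals;
  if (SizeInBits == 16)
    return !HasFP16Denormals;
  return false;
}

int main() {
  // On a subtarget with fp32 denormals enabled, s32 G_FMAD gets expanded.
  printf("%s\n", keepFMadLegal(32, /*FP32*/ true, /*FP16*/ false)
                     ? "keep G_FMAD" : "lower to fmul+fadd");
}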
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI) { @@ -1212,10 +1749,9 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg) const { - if (!Arg->isRegister()) + if (!Arg->isRegister() || !Arg->getRegister().isValid()) return false; // TODO: Handle these - assert(Arg->getRegister() != 0); assert(Arg->getRegister().isPhysical()); MachineRegisterInfo &MRI = *B.getMRI(); @@ -1229,19 +1765,30 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, const unsigned Mask = Arg->getMask(); const unsigned Shift = countTrailingZeros<unsigned>(Mask); - auto ShiftAmt = B.buildConstant(S32, Shift); - auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt); - B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift)); + Register AndMaskSrc = LiveIn; + + if (Shift != 0) { + auto ShiftAmt = B.buildConstant(S32, Shift); + AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); + } + + B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); } else B.buildCopy(DstReg, LiveIn); // Insert the argument copy if it doens't already exist. // FIXME: It seems EmitLiveInCopies isn't called anywhere? if (!MRI.getVRegDef(LiveIn)) { + // FIXME: Should have scoped insert pt + MachineBasicBlock &OrigInsBB = B.getMBB(); + auto OrigInsPt = B.getInsertPt(); + MachineBasicBlock &EntryMBB = B.getMF().front(); EntryMBB.addLiveIn(Arg->getRegister()); B.setInsertPt(EntryMBB, EntryMBB.begin()); B.buildCopy(LiveIn, Arg->getRegister()); + + B.setInsertPt(OrigInsBB, OrigInsPt); } return true; @@ -1272,6 +1819,113 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( return false; } +bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + + if (legalizeFastUnsafeFDIV(MI, MRI, B)) + return true; + + return false; +} + +bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + uint16_t Flags = MI.getFlags(); + + LLT ResTy = MRI.getType(Res); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + const MachineFunction &MF = B.getMF(); + bool Unsafe = + MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); + + if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) + return false; + + if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals()) + return false; + + if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { + // 1 / x -> RCP(x) + if (CLHS->isExactlyValue(1.0)) { + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) + .addUse(RHS) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; + } + + // -1 / x -> RCP( FNEG(x) ) + if (CLHS->isExactlyValue(-1.0)) { + auto FNeg = B.buildFNeg(ResTy, RHS, Flags); + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) + .addUse(FNeg.getReg(0)) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; + } + } + + // x / y -> x * (1.0 / y) + if (Unsafe) { + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(RHS) + .setMIFlags(Flags); + B.buildFMul(Res, LHS, RCP, Flags); + + MI.eraseFromParent(); + return true; + } + + return false; +} + +bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + 
B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + uint16_t Flags = MI.getFlags(); + + LLT S32 = LLT::scalar(32); + LLT S1 = LLT::scalar(1); + + auto Abs = B.buildFAbs(S32, RHS, Flags); + const APFloat C0Val(1.0f); + + auto C0 = B.buildConstant(S32, 0x6f800000); + auto C1 = B.buildConstant(S32, 0x2f800000); + auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); + + auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); + auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); + + auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); + + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) + .addUse(Mul0.getReg(0)) + .setMIFlags(Flags); + + auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); + + B.buildFMul(Res, Sel, Mul1, Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1306,11 +1960,79 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + unsigned AddrSpace) const { + B.setInstr(MI); + Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); + auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); + MI.eraseFromParent(); + return true; +} + +/// Handle register layout difference for f16 images for some subtargets. +Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!ST.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + LLT StoreVT = MRI.getType(Reg); + assert(StoreVT.isVector() && StoreVT.getElementType() == S16); + + auto Unmerge = B.buildUnmerge(S16, Reg); + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + + int NumElts = StoreVT.getNumElements(); + + return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat) const { + // TODO: Reject f16 format on targets where unsupported. + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + B.setInstr(MI); + + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + + // Fixup illegal register types for i8 stores. + if (Ty == LLT::scalar(8) || Ty == S16) { + Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); + MI.getOperand(1).setReg(AnyExt); + return true; + } + + if (Ty.isVector()) { + if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { + if (IsFormat) + MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); + return true; + } + + return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + } + + return Ty == S32; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
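legalizeFDIVFastIntrin above expands amdgcn.fdiv.fast using two magic constants: 0x6f800000 is the bit pattern of 2^96 and 0x2f800000 of 2^-32. A standalone model of that expansion, using 1.0f / x in place of v_rcp_f32; when |y| is very large, y is pre-scaled so the reciprocal stays in a comfortable range and the scale is multiplied back into the result.

#include <cmath>
#include <cstdio>

static float fdivFastModel(float X, float Y) {
  const float Huge = std::ldexp(1.0f, 96);    // 0x6f800000
  const float Scale = std::ldexp(1.0f, -32);  // 0x2f800000
  float S = (std::fabs(Y) > Huge) ? Scale : 1.0f;
  float Rcp = 1.0f / (Y * S);                 // stands in for v_rcp_f32
  return S * (X * Rcp);
}

int main() {
  printf("%g\n", fdivFastModel(1.0f, std::ldexp(1.0f, 100))); // ~7.88861e-31
}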
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_if: { if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { const SIRegisterInfo *TRI @@ -1386,6 +2108,22 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::amdgcn_fdiv_fast: + return legalizeFDIVFastIntrin(MI, MRI, B); + case Intrinsic::amdgcn_is_shared: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); + case Intrinsic::amdgcn_is_private: + return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); + case Intrinsic::amdgcn_wavefrontsize: { + B.setInstr(MI); + B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); + MI.eraseFromParent(); + return true; + } + case Intrinsic::amdgcn_raw_buffer_store: + return legalizeRawBufferStore(MI, MRI, B, false); + case Intrinsic::amdgcn_raw_buffer_store_format: + return legalizeRawBufferStore(MI, MRI, B, true); default: return true; } diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 3f1cc1d265dd..d0fba23a8686 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "AMDGPUArgumentUsageInfo.h" +#include "SIInstrInfo.h" namespace llvm { @@ -32,29 +33,44 @@ public: const GCNTargetMachine &TM); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, + MachineIRBuilder &B, GISelChangeObserver &Observer) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, bool Signed) const; + MachineIRBuilder &B, bool Signed) const; bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const; + MachineIRBuilder &B) const; + bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + bool buildPCRelGlobalAddress( + Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, + unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const; + + bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer) const; + + bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) 
const; Register getLiveInRegister(MachineRegisterInfo &MRI, Register Reg, LLT Ty) const; @@ -65,10 +81,24 @@ public: MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, unsigned AddrSpace) const; + + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + MachineIRBuilder &B) const override; }; } // End llvm namespace. diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index ce0a9db7c7f4..2c94e0046651 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -48,18 +49,10 @@ static cl::list<std::string> UseNative("amdgpu-use-native", cl::CommaSeparated, cl::ValueOptional, cl::Hidden); -#define MATH_PI 3.14159265358979323846264338327950288419716939937511 -#define MATH_E 2.71828182845904523536028747135266249775724709369996 -#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695 - -#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859 -#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665 -// Value of log2(10) -#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806 -// Value of 1 / log2(10) -#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085 -// Value of 1 / M_LOG2E_F = 1 / log2(e) -#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552 +#define MATH_PI numbers::pi +#define MATH_E numbers::e +#define MATH_SQRT2 numbers::sqrt2 +#define MATH_SQRT1_2 numbers::inv_sqrt2 namespace llvm { @@ -254,8 +247,8 @@ struct TableEntry { /* a list of {result, input} */ static const TableEntry tbl_acos[] = { - {MATH_PI/2.0, 0.0}, - {MATH_PI/2.0, -0.0}, + {MATH_PI / 2.0, 0.0}, + {MATH_PI / 2.0, -0.0}, {0.0, 1.0}, {MATH_PI, -1.0} }; @@ -271,8 +264,8 @@ static const TableEntry tbl_acospi[] = { static const TableEntry tbl_asin[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/2.0, 1.0}, - {-MATH_PI/2.0, -1.0} + {MATH_PI / 2.0, 1.0}, + {-MATH_PI / 2.0, -1.0} }; static const TableEntry tbl_asinh[] = { {0.0, 0.0}, @@ -287,8 +280,8 @@ static const TableEntry tbl_asinpi[] = { static const TableEntry tbl_atan[] = { {0.0, 0.0}, {-0.0, -0.0}, - {MATH_PI/4.0, 1.0}, - {-MATH_PI/4.0, -1.0} + {MATH_PI / 4.0, 1.0}, + {-MATH_PI / 4.0, -1.0} }; static const TableEntry tbl_atanh[] = { {0.0, 0.0}, @@ -359,7 +352,7 @@ static const TableEntry tbl_log10[] = { }; static const TableEntry tbl_rsqrt[] = { {1.0, 1.0}, - {1.0/MATH_SQRT2, 2.0} + {MATH_SQRT1_2, 
2.0} }; static const TableEntry tbl_sin[] = { {0.0, 0.0}, @@ -868,7 +861,7 @@ static double log2(double V) { #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L return ::log2(V); #else - return log(V) / 0.693147180559945309417; + return log(V) / numbers::ln2; #endif } } @@ -1430,8 +1423,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, B.SetInsertPoint(&*ItNew); AllocaInst *Alloc = B.CreateAlloca(RetType, 0, std::string(prefix) + UI->getName()); - Alloc->setAlignment(UCallee->getParent()->getDataLayout() - .getTypeAllocSize(RetType)); + Alloc->setAlignment(MaybeAlign( + UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; } diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index a5bac25701a0..e1ae496d9cbc 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -55,7 +55,7 @@ enum EManglingParam { }; struct ManglingRule { - StringRef const Name; + const char *Name; unsigned char Lead[2]; unsigned char Param[5]; @@ -69,7 +69,7 @@ struct ManglingRule { // Information about library functions with unmangled names. class UnmangledFuncInfo { - StringRef const Name; + const char *Name; unsigned NumArgs; // Table for all lib functions with unmangled names. @@ -82,7 +82,7 @@ class UnmangledFuncInfo { public: using ID = AMDGPULibFunc::EFuncId; - UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) + constexpr UnmangledFuncInfo(const char *_Name, unsigned _NumArgs) : Name(_Name), NumArgs(_NumArgs) {} // Get index to Table by function name. static bool lookup(StringRef Name, ID &Id); @@ -133,8 +133,8 @@ unsigned ManglingRule::getNumArgs() const { // E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of // prev lead type, etc. see ParamIterator::getNextParam() for details. 
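The folding tables in the AMDGPULibCalls hunk above are plain {result, input} pairs consulted when a call's argument is a known constant. A minimal sketch of that lookup, reusing the tbl_asin values shown in the diff; the helper names are hypothetical.

#include <cstdio>

struct TableEntry { double Result, Input; };

static const TableEntry AsinTable[] = {
    {0.0, 0.0}, {-0.0, -0.0}, {1.5707963267948966, 1.0},
    {-1.5707963267948966, -1.0}};

static bool foldFromTable(const TableEntry *Tbl, unsigned N, double In,
                          double &Out) {
  for (unsigned I = 0; I != N; ++I)
    if (Tbl[I].Input == In) { Out = Tbl[I].Result; return true; }
  return false;
}

int main() {
  double R;
  if (foldFromTable(AsinTable, 4, 1.0, R))
    printf("asin(1.0) folds to %f\n", R); // pi/2
}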
-static const ManglingRule manglingRules[] = { -{ StringRef(), {0}, {0} }, +static constexpr ManglingRule manglingRules[] = { +{ "", {0}, {0} }, { "abs" , {1}, {E_ANY}}, { "abs_diff" , {1}, {E_ANY,E_COPY}}, { "acos" , {1}, {E_ANY}}, @@ -682,9 +682,9 @@ bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) { } if (eatTerm(FuncName, "_Z")) - F.Impl = make_unique<AMDGPUMangledLibFunc>(); + F.Impl = std::make_unique<AMDGPUMangledLibFunc>(); else - F.Impl = make_unique<AMDGPUUnmangledLibFunc>(); + F.Impl = std::make_unique<AMDGPUUnmangledLibFunc>(); if (F.Impl->parseFuncName(FuncName)) return true; diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 5dd5b3691e0a..e64542a395f0 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -72,10 +72,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&*EntryBlock.begin()); - const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary + const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); - unsigned MaxAlign; + Align MaxAlign; // FIXME: Alignment is broken broken with explicit arg offset.; const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) @@ -94,12 +94,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy); unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; if (Arg.use_empty()) continue; @@ -128,8 +128,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; - unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset, - KernArgBaseAlign); + Align AdjustedAlign = commonAlignment( + KernArgBaseAlign, DoShiftOpt ? 
AlignDownOffset : EltOffset); Value *ArgPtr; Type *AdjustedArgTy; @@ -160,7 +160,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); LoadInst *Load = - Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); + Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value()); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); @@ -220,8 +220,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } KernArgSegment->addAttribute( - AttributeList::ReturnIndex, - Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ae4c32c258a7..3760aed87a43 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { lowerOperand(MO, MCOp); OutMI.addOperand(MCOp); } + + int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi); + if (FIIdx >= (int)OutMI.getNumOperands()) + OutMI.addOperand(MCOperand::createImm(0)); } bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 237490957058..ba72f71f4322 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -694,7 +694,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); // If this is a source register to a PHI we are chaining, it @@ -734,7 +734,7 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - if (TRI->isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); for (auto &UI : MRI->use_operands(Reg)) { @@ -949,7 +949,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, (IncludeLoopPHI && IsLoopPHI); if (ShouldReplace) { - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -1016,13 +1016,15 @@ bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { // before are no longer register kills. void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + (void)TRI; // It's used by LLVM_DEBUG. 
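The AMDGPULowerKernelArguments hunk above rounds each argument offset down to a dword boundary and derives the load alignment with commonAlignment against the 16-byte kernarg base. A worked sketch of that arithmetic, assuming a power-of-two base alignment; the helpers are simplified stand-ins, not the LLVM ones.

#include <cstdint>
#include <cstdio>

static uint64_t alignDownTo4(uint64_t Off) { return Off & ~uint64_t(3); }

// commonAlignment(A, Offset) == min(A, largest power of two dividing Offset).
static uint64_t commonAlign(uint64_t BaseAlign, uint64_t Offset) {
  uint64_t OffAlign = Offset ? (Offset & (~Offset + 1)) : BaseAlign;
  return OffAlign < BaseAlign ? OffAlign : BaseAlign;
}

int main() {
  uint64_t EltOffset = 38;                       // e.g. an i16 arg at byte 38
  uint64_t AlignDown = alignDownTo4(EltOffset);  // 36: load the whole dword
  uint64_t OffsetDiff = EltOffset - AlignDown;   // 2: shift amount in bytes
  printf("load at %llu, align %llu, shift %llu bytes\n",
         (unsigned long long)AlignDown,
         (unsigned long long)commonAlign(16, AlignDown),
         (unsigned long long)OffsetDiff);        // load at 36, align 4, shift 2
}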
+ for (auto MBBI : MBBs) { MachineBasicBlock *MBB = MBBI; for (auto &II : *MBB) { for (auto &RI : II.uses()) { if (RI.isReg()) { - unsigned Reg = RI.getReg(); - if (TRI->isVirtualRegister(Reg)) { + Register Reg = RI.getReg(); + if (Register::isVirtualRegister(Reg)) { if (hasNoDef(Reg, MRI)) continue; if (!MRI->hasOneDef(Reg)) { @@ -1402,7 +1404,7 @@ void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest( unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo( MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) { unsigned DestReg = getPHIDestReg(PHI); - unsigned LinearizeDestReg = + Register LinearizeDestReg = MRI->createVirtualRegister(MRI->getRegClass(DestReg)); PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc()); storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices); @@ -1890,7 +1892,7 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled( if (!Cond[0].isReg()) return; - unsigned CondReg = Cond[0].getReg(); + Register CondReg = Cond[0].getReg(); for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) { (*UI).setIsKill(false); } @@ -1929,8 +1931,8 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co BBSelectReg, TrueBB->getNumber()); } else { const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); - unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); - unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + Register TrueBBReg = MRI->createVirtualRegister(RegClass); + Register FalseBBReg = MRI->createVirtualRegister(RegClass); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, TrueBBReg, TrueBB->getNumber()); TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, @@ -1996,7 +1998,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI); } const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); - unsigned NextDestReg = MRI->createVirtualRegister(RegClass); + Register NextDestReg = MRI->createVirtualRegister(RegClass); bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, @@ -2056,8 +2058,8 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // register, unless it is the outgoing BB select register. We have // already creaed phi nodes for these. const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); - unsigned PHIDestReg = MRI->createVirtualRegister(RegClass); - unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); + Register PHIDestReg = MRI->createVirtualRegister(RegClass); + Register IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA. 
LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); @@ -2172,7 +2174,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent(); const TargetRegisterClass *RegClass = MRI->getRegClass(CurrentBackedgeReg); - unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass); + Register NewBackedgeReg = MRI->createVirtualRegister(RegClass); MachineInstrBuilder BackedgePHI = BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), NewBackedgeReg); @@ -2230,7 +2232,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, I != E;) { MachineOperand &O = *I; ++I; - if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + if (Register::isPhysicalRegister(NewRegister)) { LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); @@ -2309,7 +2311,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( } else { // Handle internal block. const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); - unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass); rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, @@ -2446,7 +2448,7 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, } const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest); - unsigned NewDestReg = MRI->createVirtualRegister(RegClass); + Register NewDestReg = MRI->createVirtualRegister(RegClass); LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI); MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), @@ -2734,9 +2736,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { } const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); unsigned InReg = LRegion->getBBSelectRegIn(); - unsigned InnerSelectReg = + Register InnerSelectReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); - unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); TII->materializeImmediate(*(LRegion->getEntry()), LRegion->getEntry()->getFirstTerminator(), DL, NewInReg, Region->getEntry()->getNumber()); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0d3a1f1a769f..89ca702f577d 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -17,7 +17,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), ExplicitKernArgSize(0), - MaxKernArgAlign(0), LDSSize(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 52987e2fa411..9818ab1ef148 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -23,7 +23,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { protected: uint64_t ExplicitKernArgSize; // Cache for this. - unsigned MaxKernArgAlign; // Cache for this. 
+ Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; @@ -47,9 +47,7 @@ public: return ExplicitKernArgSize; } - unsigned getMaxKernArgAlign() const { - return MaxKernArgAlign; - } + unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); } unsigned getLDSSize() const { return LDSSize; diff --git a/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp new file mode 100644 index 000000000000..5250bf455d71 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -0,0 +1,592 @@ +//=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// +// The pass bind printfs to a kernel arg pointer that will be bound to a buffer +// later by the runtime. +// +// This pass traverses the functions in the module and converts +// each call to printf to a sequence of operations that +// store the following into the printf buffer: +// - format string (passed as a module's metadata unique ID) +// - bitwise copies of printf arguments +// The backend passes will need to store metadata in the kernel +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "printfToRuntime" +#define DWORD_ALIGN 4 + +namespace { +class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final + : public ModulePass { + +public: + static char ID; + + explicit AMDGPUPrintfRuntimeBinding(); + +private: + bool runOnModule(Module &M) override; + void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers, + StringRef fmt, size_t num_ops) const; + + bool shouldPrintAsStr(char Specifier, Type *OpType) const; + bool + lowerPrintfForGpu(Module &M, + function_ref<const TargetLibraryInfo &(Function &)> GetTLI); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + } + + Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) { + return SimplifyInstruction(I, {*TD, TLI, DT}); + } + + const DataLayout *TD; + const DominatorTree *DT; + SmallVector<CallInst *, 32> Printfs; +}; +} // namespace + +char AMDGPUPrintfRuntimeBinding::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding, + "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, 
"amdgpu-printf-runtime-binding", + "AMDGPU Printf lowering", false, false) + +char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID; + +namespace llvm { +ModulePass *createAMDGPUPrintfRuntimeBinding() { + return new AMDGPUPrintfRuntimeBinding(); +} +} // namespace llvm + +AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() + : ModulePass(ID), TD(nullptr), DT(nullptr) { + initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); +} + +void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers( + SmallVectorImpl<char> &OpConvSpecifiers, StringRef Fmt, + size_t NumOps) const { + // not all format characters are collected. + // At this time the format characters of interest + // are %p and %s, which use to know if we + // are either storing a literal string or a + // pointer to the printf buffer. + static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + size_t CurFmtSpecifierIdx = 0; + size_t PrevFmtSpecifierIdx = 0; + + while ((CurFmtSpecifierIdx = Fmt.find_first_of( + ConvSpecifiers, CurFmtSpecifierIdx)) != StringRef::npos) { + bool ArgDump = false; + StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx, + CurFmtSpecifierIdx - PrevFmtSpecifierIdx); + size_t pTag = CurFmt.find_last_of("%"); + if (pTag != StringRef::npos) { + ArgDump = true; + while (pTag && CurFmt[--pTag] == '%') { + ArgDump = !ArgDump; + } + } + + if (ArgDump) + OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]); + + PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx; + } +} + +bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier, + Type *OpType) const { + if (Specifier != 's') + return false; + const PointerType *PT = dyn_cast<PointerType>(OpType); + if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return false; + Type *ElemType = PT->getContainedType(0); + if (ElemType->getTypeID() != Type::IntegerTyID) + return false; + IntegerType *ElemIType = cast<IntegerType>(ElemType); + return ElemIType->getBitWidth() == 8; +} + +bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( + Module &M, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32Ty = Type::getInt32Ty(Ctx); + unsigned UniqID = 0; + // NB: This is important for this string size to be divizable by 4 + const char NonLiteralStr[4] = "???"; + + for (auto CI : Printfs) { + unsigned NumOps = CI->getNumArgOperands(); + + SmallString<16> OpConvSpecifiers; + Value *Op = CI->getArgOperand(0); + + if (auto LI = dyn_cast<LoadInst>(Op)) { + Op = LI->getPointerOperand(); + for (auto Use : Op->users()) { + if (auto SI = dyn_cast<StoreInst>(Use)) { + Op = SI->getValueOperand(); + break; + } + } + } + + if (auto I = dyn_cast<Instruction>(Op)) { + Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction())); + if (Op_simplified) + Op = Op_simplified; + } + + ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Op); + + if (ConstExpr) { + GlobalVariable *GVar = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + + StringRef Str("unknown"); + if (GVar && GVar->hasInitializer()) { + auto Init = GVar->getInitializer(); + if (auto CA = dyn_cast<ConstantDataArray>(Init)) { + if (CA->isString()) + Str = CA->getAsCString(); + } else if (isa<ConstantAggregateZero>(Init)) { + Str = ""; + } + // + // we need this call to ascertain + // that we are printing a string + // or a pointer. 
It takes out the + // specifiers and fills up the first + // arg + getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1); + } + // Add metadata for the string + std::string AStreamHolder; + raw_string_ostream Sizes(AStreamHolder); + int Sum = DWORD_ALIGN; + Sizes << CI->getNumArgOperands() - 1; + Sizes << ':'; + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + unsigned ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + // + // ArgSize by design should be a multiple of DWORD_ALIGN, + // expand the arguments that do not follow this rule. + // + if (ArgSize % DWORD_ALIGN != 0) { + llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx); + VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType); + int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1; + if (LLVMVecType && NumElem > 1) + ResType = llvm::VectorType::get(ResType, NumElem); + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + if (OpConvSpecifiers[ArgCount - 1] == 'x' || + OpConvSpecifiers[ArgCount - 1] == 'X' || + OpConvSpecifiers[ArgCount - 1] == 'u' || + OpConvSpecifiers[ArgCount - 1] == 'o') + Arg = Builder.CreateZExt(Arg, ResType); + else + Arg = Builder.CreateSExt(Arg, ResType); + ArgType = Arg->getType(); + ArgSize = TD->getTypeAllocSizeInBits(ArgType); + ArgSize = ArgSize / 8; + CI->setOperand(ArgCount, Arg); + } + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *FpCons = dyn_cast<ConstantFP>(Arg); + if (FpCons) + ArgSize = 4; + else { + FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) + ArgSize = 4; + } + } + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) { + GlobalVariable *GV = + dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init); + if (Init->isZeroValue() || CA->isString()) { + size_t SizeStr = Init->isZeroValue() + ? 1 + : (strlen(CA->getAsCString().data()) + 1); + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr + << '\n'); + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + ArgSize = NSizeStr; + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } else { + ArgSize = sizeof(NonLiteralStr); + } + } + LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize + << " for type: " << *ArgType << '\n'); + Sizes << ArgSize << ':'; + Sum += ArgSize; + } + LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str() + << '\n'); + for (size_t I = 0; I < Str.size(); ++I) { + // Rest of the C escape sequences (e.g. 
\') are handled correctly + // by the MDParser + switch (Str[I]) { + case '\a': + Sizes << "\\a"; + break; + case '\b': + Sizes << "\\b"; + break; + case '\f': + Sizes << "\\f"; + break; + case '\n': + Sizes << "\\n"; + break; + case '\r': + Sizes << "\\r"; + break; + case '\v': + Sizes << "\\v"; + break; + case ':': + // ':' cannot be scanned by Flex, as it is defined as a delimiter + // Replace it with it's octal representation \72 + Sizes << "\\72"; + break; + default: + Sizes << Str[I]; + break; + } + } + + // Insert the printf_alloc call + Builder.SetInsertPoint(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex, + Attribute::NoUnwind); + + Type *SizetTy = Type::getInt32Ty(Ctx); + + Type *Tys_alloc[1] = {SizetTy}; + Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1); + FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false); + FunctionCallee PrintfAllocFn = + M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr); + + LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n'); + std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str(); + MDString *fmtStrArray = MDString::get(Ctx, fmtstr); + + // Instead of creating global variables, the + // printf format strings are extracted + // and passed as metadata. This avoids + // polluting llvm's symbol tables in this module. + // Metadata is going to be extracted + // by the backend passes and inserted + // into the OpenCL binary as appropriate. + StringRef amd("llvm.printf.fmts"); + NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd); + MDNode *myMD = MDNode::get(Ctx, fmtStrArray); + metaD->addOperand(myMD); + Value *sumC = ConstantInt::get(SizetTy, Sum, false); + SmallVector<Value *, 1> alloc_args; + alloc_args.push_back(sumC); + CallInst *pcall = + CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI); + + // + // Insert code to split basicblock with a + // piece of hammock code. 
+ // basicblock splits after buffer overflow check + // + ConstantPointerNull *zeroIntPtr = + ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1)); + ICmpInst *cmp = + dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, "")); + if (!CI->use_empty()) { + Value *result = + Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res"); + CI->replaceAllUsesWith(result); + } + SplitBlock(CI->getParent(), cmp); + Instruction *Brnch = + SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false); + + Builder.SetInsertPoint(Brnch); + + // store unique printf id in the buffer + // + SmallVector<Value *, 1> ZeroIdxList; + ConstantInt *zeroInt = + ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10)); + ZeroIdxList.push_back(zeroInt); + + GetElementPtrInst *BufferIdx = + dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch)); + + Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS); + Value *id_gep_cast = + new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch); + + StoreInst *stbuff = + new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast); + stbuff->insertBefore(Brnch); // to Remove unused variable warning + + SmallVector<Value *, 2> FourthIdxList; + ConstantInt *fourInt = + ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10)); + + FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id + // the following GEP is the buffer pointer + BufferIdx = cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch)); + + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int64Ty = Type::getInt64Ty(Ctx); + for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() && + ArgCount <= OpConvSpecifiers.size(); + ArgCount++) { + Value *Arg = CI->getArgOperand(ArgCount); + Type *ArgType = Arg->getType(); + SmallVector<Value *, 32> WhatToStore; + if (ArgType->isFPOrFPVectorTy() && + (ArgType->getTypeID() != Type::VectorTyID)) { + Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty; + if (OpConvSpecifiers[ArgCount - 1] == 'f') { + ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg); + if (fpCons) { + APFloat Val(fpCons->getValueAPF()); + bool Lost = false; + Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, + &Lost); + Arg = ConstantFP::get(Ctx, Val); + IType = Int32Ty; + } else { + FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg); + if (FpExt && FpExt->getType()->isDoubleTy() && + FpExt->getOperand(0)->getType()->isFloatTy()) { + Arg = FpExt->getOperand(0); + IType = Int32Ty; + } + } + } + Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch); + WhatToStore.push_back(Arg); + } else if (ArgType->getTypeID() == Type::PointerTyID) { + if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) { + const char *S = NonLiteralStr; + if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) { + GlobalVariable *GV = + dyn_cast<GlobalVariable>(ConstExpr->getOperand(0)); + if (GV && GV->hasInitializer()) { + Constant *Init = GV->getInitializer(); + ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init); + if (Init->isZeroValue() || CA->isString()) { + S = Init->isZeroValue() ? 
"" : CA->getAsCString().data(); + } + } + } + size_t SizeStr = strlen(S) + 1; + size_t Rem = SizeStr % DWORD_ALIGN; + size_t NSizeStr = 0; + if (Rem) { + NSizeStr = SizeStr + (DWORD_ALIGN - Rem); + } else { + NSizeStr = SizeStr; + } + if (S[0]) { + char *MyNewStr = new char[NSizeStr](); + strcpy(MyNewStr, S); + int NumInts = NSizeStr / 4; + int CharC = 0; + while (NumInts) { + int ANum = *(int *)(MyNewStr + CharC); + CharC += 4; + NumInts--; + Value *ANumV = ConstantInt::get(Int32Ty, ANum, false); + WhatToStore.push_back(ANumV); + } + delete[] MyNewStr; + } else { + // Empty string, give a hint to RT it is no NULL + Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false); + WhatToStore.push_back(ANumV); + } + } else { + uint64_t Size = TD->getTypeAllocSizeInBits(ArgType); + assert((Size == 32 || Size == 64) && "unsupported size"); + Type *DstType = (Size == 32) ? Int32Ty : Int64Ty; + Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch); + WhatToStore.push_back(Arg); + } + } else if (ArgType->getTypeID() == Type::VectorTyID) { + Type *IType = NULL; + uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements(); + uint32_t EleSize = ArgType->getScalarSizeInBits(); + uint32_t TotalSize = EleCount * EleSize; + if (EleCount == 3) { + IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext()); + Constant *Indices[4] = { + ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)}; + Constant *Mask = ConstantVector::get(Indices); + ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask); + Shuffle->insertBefore(Brnch); + Arg = Shuffle; + ArgType = Arg->getType(); + TotalSize += EleSize; + } + switch (EleSize) { + default: + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + break; + case 8: + if (EleCount >= 8) { + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + } else if (EleCount >= 3) { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt16Ty(ArgType->getContext())); + } + break; + case 16: + if (EleCount >= 3) { + EleCount = TotalSize / 64; + IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext())); + } else { + EleCount = 1; + IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext())); + } + break; + } + if (EleCount > 1) { + IType = dyn_cast<Type>(VectorType::get(IType, EleCount)); + } + Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch); + WhatToStore.push_back(Arg); + } else { + WhatToStore.push_back(Arg); + } + for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { + Value *TheBtCast = WhatToStore[I]; + unsigned ArgSize = + TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8; + SmallVector<Value *, 1> BuffOffset; + BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize)); + + Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1); + Value *CastedGEP = + new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch); + StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch); + LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n" + << *StBuff << '\n'); + (void)StBuff; + if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands()) + break; + BufferIdx = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create( + nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch)); + LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" + << *BufferIdx << '\n'); + } + } + } + } + + 
// erase the printf calls + for (auto CI : Printfs) + CI->eraseFromParent(); + + Printfs.clear(); + return true; +} + +bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::r600) + return false; + + auto PrintfFunction = M.getFunction("printf"); + if (!PrintfFunction) + return false; + + for (auto &U : PrintfFunction->uses()) { + if (auto *CI = dyn_cast<CallInst>(U.getUser())) { + if (CI->isCallee(&U)) + Printfs.push_back(CI); + } + } + + if (Printfs.empty()) + return false; + + TD = &M.getDataLayout(); + auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + + return lowerPrintfForGpu(M, GetTLI); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e4c9d6685d4a..3e9dcca114a3 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -801,7 +801,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlignment()); + GV->setAlignment(MaybeAlign(I.getAlignment())); Value *TCntY, *TCntZ; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 815cbc5e26ee..4d78188b3dc3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,9 +17,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -33,6 +33,7 @@ #include "AMDGPUGenRegisterBankInfo.def" using namespace llvm; +using namespace MIPatternMatch; namespace { @@ -84,9 +85,11 @@ public: }; } -AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : AMDGPUGenRegisterBankInfo(), - TRI(static_cast<const SIRegisterInfo*>(&TRI)) { + Subtarget(ST), + TRI(Subtarget.getRegisterInfo()), + TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. static bool AlreadyInit = false; @@ -163,11 +166,10 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost( const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( const TargetRegisterClass &RC) const { + if (&RC == &AMDGPU::SReg_1RegClass) + return AMDGPU::VCCRegBank; - if (TRI->isSGPRClass(&RC)) - return getRegBank(AMDGPU::SGPRRegBankID); - - return getRegBank(AMDGPU::VGPRRegBankID); + return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank; } template <unsigned NumOps> @@ -192,7 +194,8 @@ AMDGPURegisterBankInfo::addMappingFromTable( Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); } - unsigned MappingID = 0; + // getInstrMapping's default mapping uses ID 1, so start at 2. 
+ unsigned MappingID = 2; for (const auto &Entry : Table) { for (unsigned I = 0; I < NumOps; ++I) { int OpIdx = RegSrcOpIdx[I]; @@ -210,7 +213,7 @@ AMDGPURegisterBankInfo::addMappingFromTable( RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_readlane: { static const OpRegBankEntry<3> Table[2] = { // Perfectly legal. @@ -251,7 +254,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_buffer_load: { static const OpRegBankEntry<3> Table[4] = { // Perfectly legal. @@ -303,6 +306,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { + // FIXME: Should have no register for immediate static const OpRegBankEntry<1> Table[2] = { // Perfectly legal. { { AMDGPU::SGPRRegBankID }, 1 }, @@ -319,12 +323,15 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } -static bool isInstrUniform(const MachineInstr &MI) { +// FIXME: Returns uniform if there's no source value information. This is +// probably wrong. +static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) { if (!MI.hasOneMemOperand()) return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); + return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && + AMDGPUInstrInfo::isUniformMMO(MMO); } RegisterBankInfo::InstructionMappings @@ -337,6 +344,31 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; switch (MI.getOpcode()) { + case TargetOpcode::G_CONSTANT: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + if (Size == 1) { + static const OpRegBankEntry<1> Table[4] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::VCCRegBankID }, 1 }, + { { AMDGPU::SCCRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } + + LLVM_FALLTHROUGH; + } + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + case TargetOpcode::G_GLOBAL_VALUE: { + static const OpRegBankEntry<1> Table[2] = { + { { AMDGPU::VGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 } + }; + + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); + } case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: { @@ -408,23 +440,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&VSMapping); break; } - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_ZEXTLOAD: + case TargetOpcode::G_SEXTLOAD: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + unsigned PtrSize = PtrTy.getSizeInBits(); + unsigned AS = PtrTy.getAddressSpace(); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - // FIXME: Should we be hard coding the size for these mappings? 
- if (isInstrUniform(MI)) { + if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { const InstructionMapping &SSMapping = getInstructionMapping( 1, 1, getOperandsMapping( {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&SSMapping); } const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&VVMapping); @@ -620,57 +658,53 @@ static LLT getHalfSizedType(LLT Ty) { /// /// There is additional complexity to try for compare values to identify the /// unique values used. -void AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineInstr &MI, MachineRegisterInfo &MRI, - ArrayRef<unsigned> OpIndices) const { - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. - SmallSet<Register, 4> SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return; - - MachineIRBuilder B(MI); +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector<Register, 4> ResultRegs; SmallVector<Register, 4> InitResultRegs; SmallVector<Register, 4> PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + const unsigned WaveAndOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned MovTermOpc = Subtarget.isWave32() ? + AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? + AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + const unsigned AndSaveExecOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + const unsigned ExecReg = Subtarget.isWave32() ? 
+ AMDGPU::EXEC_LO : AMDGPU::EXEC; + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } - Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); // Don't bother using generic instructions/registers for the exec mask. B.buildInstr(TargetOpcode::IMPLICIT_DEF) .addDef(InitSaveExecReg); - Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PhiExec = MRI.createVirtualRegister(WaveRC); + Register NewExec = MRI.createVirtualRegister(WaveRC); // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. @@ -688,7 +722,7 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -711,164 +745,173 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); + const DebugLoc &DL = B.getDL(); + + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - B.setInstr(*I); + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + + auto NewEnd = LoopBB->end(); + + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); Register CondReg; - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); - - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector<Register, 8> ReadlanePieces; + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. + Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - bool Is64 = OpSize % 64 == 0; + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); - // Insert the unmerge before the loop. + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } else { + LLT S32 = LLT::scalar(32); + SmallVector<Register, 8> ReadlanePieces; - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - unsigned UnmergePiece = Unmerge.getReg(PieceIdx); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + bool Is64 = OpSize % 64 == 0; - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? 
AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + // Insert the unmerge before the loop. - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); + + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - if (!First) { - Register AndReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); } - } - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } - - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } } } @@ -876,16 +919,16 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) + B.buildInstr(AndSaveExecOpc) .addDef(NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - B.buildInstr(AMDGPU::S_XOR_B64_term) - .addDef(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + B.buildInstr(XorTermOpc) + .addDef(ExecReg) + .addReg(ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -896,14 +939,60 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(AMDGPU::S_MOV_B64_term) - .addDef(AMDGPU::EXEC) + B.buildInstr(MovTermOpc) + .addDef(ExecReg) .addReg(SaveExecReg); + + // Restore the insert point before the original instruction. + B.setInsertPt(MBB, MBB.end()); + + return true; +} + +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operansd to handle and a +// waterfall loop is necessary. 
+bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. + SmallSet<Register, 4> SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const { + MachineIRBuilder B(MI); + return executeInWaterfallLoop(B, MI, MRI, OpIndices); } // Legalize an operand that must be an SGPR by inserting a readfirstlane. @@ -960,8 +1049,13 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); // If the pointer is an SGPR, we have nothing to do. - if (SrcRegs.empty()) - return false; + if (SrcRegs.empty()) { + Register PtrReg = MI.getOperand(1).getReg(); + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + if (PtrBank == &AMDGPU::SGPRRegBank) + return false; + SrcRegs.push_back(PtrReg); + } assert(LoadSize % MaxNonSmrdLoadSize == 0); @@ -1013,6 +1107,33 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, return true; } +bool AMDGPURegisterBankInfo::applyMappingImage( + MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI, int RsrcIdx) const { + const int NumDefs = MI.getNumExplicitDefs(); + + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += NumDefs + 1; + + // Insert copies to VGPR arguments. + applyDefaultMapping(OpdMapper); + + // Fixup any SGPR arguments. + SmallVector<unsigned, 4> SGPRIndexes; + for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + // If this intrinsic has a sampler, it immediately follows rsrc. + if (I == RsrcIdx || I == RsrcIdx + 1) + SGPRIndexes.push_back(I); + } + + executeInWaterfallLoop(MI, MRI, SGPRIndexes); + return true; +} + // For cases where only a single copy is inserted for matching register banks. // Replace the register in the instruction operand static void substituteSimpleCopyRegs( @@ -1024,6 +1145,184 @@ static void substituteSimpleCopyRegs( } } +/// Handle register layout difference for f16 images for some subtargets. 
+Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Reg) const { + if (!Subtarget.hasUnpackedD16VMem()) + return Reg; + + const LLT S16 = LLT::scalar(16); + LLT StoreVT = MRI.getType(Reg); + if (!StoreVT.isVector() || StoreVT.getElementType() != S16) + return Reg; + + auto Unmerge = B.buildUnmerge(S16, Reg); + + + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(Unmerge.getReg(I)); + + const LLT S32 = LLT::scalar(32); + int NumElts = StoreVT.getNumElements(); + + return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); +} + +static std::pair<Register, unsigned> +getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { + int64_t Const; + if (mi_match(Reg, MRI, m_ICst(Const))) + return std::make_pair(Register(), Const); + + Register Base; + if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) + return std::make_pair(Base, Const); + + // TODO: Handle G_OR used for add case + return std::make_pair(Reg, 0); +} + +std::pair<Register, unsigned> +AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned ImmOffset; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), + OrigOffset); + + unsigned C1 = 0; + if (ImmOffset != 0) { + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. + unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + C1 = ImmOffset; + if (Overflow != 0) { + if (!BaseReg) + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return {BaseReg, C1}; +} + +static bool isZero(Register Reg, MachineRegisterInfo &MRI) { + int64_t C; + return mi_match(Reg, MRI, m_ICst(C)) && C == 0; +} + +static unsigned extractGLC(unsigned CachePolicy) { + return CachePolicy & 1; +} + +static unsigned extractSLC(unsigned CachePolicy) { + return (CachePolicy >> 1) & 1; +} + +static unsigned extractDLC(unsigned CachePolicy) { + return (CachePolicy >> 2) & 1; +} + +MachineInstr * +AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); + executeInWaterfallLoop(B, MI, MRI, {2, 4}); + + // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. + + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + + int EltSize = Ty.getScalarSizeInBits(); + int Size = Ty.getSizeInBits(); + + // FIXME: Broken integer truncstore. + if (EltSize != 32) + report_fatal_error("unhandled intrinsic store"); + + // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
+ const int MemSize = (*MI.memoperands_begin())->getSize(); + + + Register RSrc = MI.getOperand(2).getReg(); + Register VOffset = MI.getOperand(3).getReg(); + Register SOffset = MI.getOperand(4).getReg(); + unsigned CachePolicy = MI.getOperand(5).getImm(); + + unsigned ImmOffset; + std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); + + const bool Offen = !isZero(VOffset, MRI); + + unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; + switch (8 * MemSize) { + case 8: + Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : + AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; + break; + case 16: + Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : + AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; + break; + default: + Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : + AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; + if (Size > 32) + Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); + break; + } + + + // Set the insertion point back to the instruction in case it was moved into a + // loop. + B.setInstr(MI); + + MachineInstrBuilder MIB = B.buildInstr(Opc) + .addUse(VData); + + if (Offen) + MIB.addUse(VOffset); + + MIB.addUse(RSrc) + .addUse(SOffset) + .addImm(ImmOffset) + .addImm(extractGLC(CachePolicy)) + .addImm(extractSLC(CachePolicy)) + .addImm(0) // tfe: FIXME: Remove from inst + .addImm(extractDLC(CachePolicy)) + .cloneMemRefs(MI); + + // FIXME: We need a way to report failure from applyMappingImpl. + // Insert constrain copies before inserting the loop. + if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + report_fatal_error("failed to constrain selected store intrinsic"); + + return MIB; +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1289,12 +1588,202 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::vector(2, 16)) + break; + + assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); + substituteSimpleCopyRegs(OpdMapper, 1); + substituteSimpleCopyRegs(OpdMapper, 2); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + if (DstBank == &AMDGPU::SGPRRegBank) + break; // Can use S_PACK_* instructions. 
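For a VGPR destination the pack is built below from 32-bit pieces: zero-extend the low element, shift the high element left by 16, OR them, then bitcast to <2 x s16> (the TRUNC variant masks the already-32-bit low source instead of zero-extending). The bit arithmetic is equivalent to this scalar sketch, shown only as an illustration:

#include <cstdint>

constexpr uint32_t packV2I16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16); // low element in bits [15:0]
}
static_assert(packV2I16(0x1234, 0xABCD) == 0xABCD1234,
              "high element lands in the upper half of the 32-bit register");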
+ + MachineIRBuilder B(MI); + + Register Lo = MI.getOperand(1).getReg(); + Register Hi = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); + + const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI); + const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI); + + Register ZextLo; + Register ShiftHi; + + if (Opc == AMDGPU::G_BUILD_VECTOR) { + ZextLo = B.buildZExt(S32, Lo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + + Register ZextHi = B.buildZExt(S32, Hi).getReg(0); + MRI.setRegBank(ZextHi, *BankHi); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + } else { + Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); + MRI.setRegBank(MaskLo, *BankLo); + + auto ShiftAmt = B.buildConstant(S32, 16); + MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); + + ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); + MRI.setRegBank(ShiftHi, *BankHi); + + ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); + MRI.setRegBank(ZextLo, *BankLo); + } + + auto Or = B.buildOr(S32, ZextLo, ShiftHi); + MRI.setRegBank(Or.getReg(0), *DstBank); + + B.buildBitcast(DstReg, Or); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); + + assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet<Register, 4> OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. 
+ B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); + + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(1).empty()); + assert(OpdMapper.getVRegs(3).empty()); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet<Register, 4> OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); return; + } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS executeInWaterfallLoop(MI, MRI, { 2, 3 }); @@ -1303,8 +1792,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(3).empty()); // Make sure the index is an SGPR. It doesn't make sense to run this in a // waterfall loop, so assume it's a uniform value. 
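The waterfall loop built by executeInWaterfallLoop above is the common fallback whenever an operand that must be scalar turns out to live in a VGPR. As a rough CPU-side model of what the emitted loop does for one wave (illustration only, not the generated MIR; the 64-lane width and uint32_t operand type are simplifications):

#include <cstdint>

template <typename FnTy>
void waterfallModel(const uint32_t LaneVals[64], uint64_t Exec, FnTy Body) {
  while (Exec) {
    unsigned FirstLane = __builtin_ctzll(Exec);  // V_READFIRSTLANE_B32
    uint32_t Uniform = LaneVals[FirstLane];
    uint64_t SameMask = 0;
    for (unsigned L = 0; L != 64; ++L)           // V_CMP_EQ_U32 across lanes
      if (((Exec >> L) & 1) && LaneVals[L] == Uniform)
        SameMask |= 1ull << L;
    Body(Uniform, SameMask); // body runs with EXEC restricted (S_AND_SAVEEXEC)
    Exec &= ~SameMask;       // retire the handled lanes (S_XOR_*_term)
  }
}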
@@ -1312,9 +1801,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_writelane: { - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(2))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(2).empty()); + assert(OpdMapper.getVRegs(3).empty()); substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val constrainOpWithReadfirstlane(MI, MRI, 2); // Source value @@ -1327,7 +1816,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_buffer_load: { executeInWaterfallLoop(MI, MRI, { 2 }); return; @@ -1335,23 +1825,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. - assert(empty(OpdMapper.getVRegs(0))); + assert(OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 3); constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + // Only the first lane is executes, so readfirstlane is safe. + substituteSimpleCopyRegs(OpdMapper, 1); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // Only the first lane is executes, so readfirstlane is safe. + constrainOpWithReadfirstlane(MI, MRI, 1); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } - default: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 4}); + return; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_tbuffer_store: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 5}); + return; + } + default: { + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. 
+ if (RSrcIntrin->IsImage) { + applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + return; + } + } + break; } + } break; } - case AMDGPU::G_LOAD: { + case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: { if (applyMappingWideLoad(MI, OpdMapper, MRI)) return; break; @@ -1452,25 +1989,71 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { } const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const { + // The reported argument index is relative to the IR intrinsic call arguments, + // so we need to shift by the number of defs and the intrinsic ID. + RsrcIdx += MI.getNumExplicitDefs() + 1; + + const int NumOps = MI.getNumOperands(); + SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); + + // TODO: Should packed/unpacked D16 difference be reported here as part of + // the value mapping? + for (int I = 0; I != NumOps; ++I) { + if (!MI.getOperand(I).isReg()) + continue; + + Register OpReg = MI.getOperand(I).getReg(); + unsigned Size = getSizeInBits(OpReg, MRI, *TRI); + + // FIXME: Probably need a new intrinsic register bank searchable table to + // handle arbitrary intrinsics easily. + // + // If this has a sampler, it immediately follows rsrc. + const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; + + if (MustBeSGPR) { + // If this must be an SGPR, so we must report whatever it is as legal. + unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID); + OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); + } else { + // Some operands must be VGPR, and these are easy to copy to. + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); +} + +const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); - unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + Register PtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(PtrReg); + unsigned AS = PtrTy.getAddressSpace(); + unsigned PtrSize = PtrTy.getSizeInBits(); const ValueMapping *ValMapping; const ValueMapping *PtrMapping; - if (isInstrUniform(MI)) { + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + + if (PtrBank == &AMDGPU::SGPRRegBank && + (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && + AS != AMDGPUAS::PRIVATE_ADDRESS) && + isInstrUniformNonExtLoadAlign4(MI)) { // We have a uniform instruction so we want to use an SMRD load ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); - // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -1494,6 +2077,31 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg, return Bank ? 
Bank->getID() : Default; } + +static unsigned regBankUnion(unsigned RB0, unsigned RB1) { + return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + // Lie and claim anything is legal, even though this needs to be an SGPR + // applyMapping will have to deal with it as a waterfall loop. + unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID); + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(Bank, Size); +} + +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const { + unsigned Size = getSizeInBits(Reg, MRI, TRI); + return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); +} + /// /// This function must return a legal mapping, because /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called @@ -1536,7 +2144,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { int ResultBank = -1; for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - unsigned Reg = MI.getOperand(I).getReg(); + Register Reg = MI.getOperand(I).getReg(); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); // FIXME: Assuming VGPR for any undetermined inputs. @@ -1660,7 +2268,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLVM_FALLTHROUGH; } - case AMDGPU::G_GEP: case AMDGPU::G_ADD: case AMDGPU::G_SUB: @@ -1669,15 +2276,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_LSHR: case AMDGPU::G_ASHR: case AMDGPU::G_UADDO: - case AMDGPU::G_SADDO: case AMDGPU::G_USUBO: - case AMDGPU::G_SSUBO: case AMDGPU::G_UADDE: case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: - case AMDGPU::G_UMULH: - case AMDGPU::G_SMULH: case AMDGPU::G_SMIN: case AMDGPU::G_SMAX: case AMDGPU::G_UMIN: @@ -1692,17 +2295,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: case AMDGPU::G_FSQRT: + case AMDGPU::G_FFLOOR: + case AMDGPU::G_FCEIL: + case AMDGPU::G_FRINT: case AMDGPU::G_SITOFP: case AMDGPU::G_UITOFP: case AMDGPU::G_FPTRUNC: case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_AMDGPU_FFBH_U32: + return getDefaultMappingVOP(MI); + case AMDGPU::G_UMULH: + case AMDGPU::G_SMULH: { + if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) + return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); + } case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -1710,12 +2328,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FCONSTANT: case AMDGPU::G_CONSTANT: - case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_GLOBAL_VALUE: case AMDGPU::G_BLOCK_ADDR: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case 
AMDGPU::G_FRAME_INDEX: { + // TODO: This should be the same as other constants, but eliminateFrameIndex + // currently assumes VALU uses. + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1737,8 +2362,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = nullptr; break; } - case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + if (DstTy == LLT::vector(2, 16)) { + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); + OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); + break; + } + + LLVM_FALLTHROUGH; + } + case AMDGPU::G_MERGE_VALUES: case AMDGPU::G_CONCAT_VECTORS: { unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -1760,6 +2402,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_CTTZ_ZERO_UNDEF: case AMDGPU::G_CTPOP: case AMDGPU::G_BSWAP: + case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -1848,7 +2491,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64())); + Subtarget.hasScalarCompareEq64())); unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; @@ -1859,14 +2502,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. 
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -1879,15 +2524,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. - OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -1903,11 +2551,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { + switch (MI.getIntrinsicID()) { default: return getInvalidInstructionMapping(); - case Intrinsic::maxnum: - case Intrinsic::minnum: case Intrinsic::amdgcn_div_fmas: case Intrinsic::amdgcn_trig_preop: case Intrinsic::amdgcn_sin: @@ -1938,6 +2584,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mbcnt_hi: case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_mul_u24: + case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: case Intrinsic::amdgcn_sad_u8: case Intrinsic::amdgcn_msad_u8: @@ -1956,10 +2604,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: - case Intrinsic::amdgcn_fdiv_fast: case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_ds_swizzle: case Intrinsic::amdgcn_ds_permute: case Intrinsic::amdgcn_ds_bpermute: case Intrinsic::amdgcn_update_dpp: @@ -2040,7 +2688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_readlane: { // This must be an SGPR, but accept a VGPR. 
- unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -2055,10 +2703,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_writelane: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned SrcReg = MI.getOperand(2).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID); - unsigned IdxReg = MI.getOperand(3).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); @@ -2081,9 +2729,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) { - default: - return getInvalidInstructionMapping(); + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: case Intrinsic::amdgcn_s_memrealtime: @@ -2123,18 +2770,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_exp: - OpdsMapping[0] = nullptr; // IntrinsicID - // FIXME: These are immediate values which can't be read from registers. - OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); // FIXME: Could we support packed types here? OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); - // FIXME: These are immediate values which can't be read from registers. 
- OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_buffer_load: { Register RSrc = MI.getOperand(2).getReg(); // SGPR @@ -2169,11 +2809,97 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_init_exec: { + unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_else: { + unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); + break; + } + case Intrinsic::amdgcn_kill: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_tbuffer_load: { + // FIXME: Should make intrinsic ID the last operand of the instruction, + // then this would be the same as store + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_init_exec_from_input: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // This must be an SGPR, but 
accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; + } + default: + if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. + if (RSrcIntrin->IsImage) + return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); + } + + return getInvalidInstructionMapping(); } break; } @@ -2216,6 +2942,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_LOAD: + case AMDGPU::G_ZEXTLOAD: + case AMDGPU::G_SEXTLOAD: return getInstrMappingForLoad(MI); case AMDGPU::G_ATOMICRMW_XCHG: @@ -2228,6 +2956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_MIN: case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: + case AMDGPU::G_ATOMICRMW_FADD: case AMDGPU::G_ATOMIC_CMPXCHG: { return getDefaultMappingAllVGPR(MI); } @@ -2247,4 +2976,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } - diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index f3a96e2a6128..a14b74961118 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -23,7 +25,9 @@ namespace llvm { class LLT; +class GCNSubtarget; class MachineIRBuilder; +class SIInstrInfo; class SIRegisterInfo; class TargetRegisterInfo; @@ -36,9 +40,27 @@ protected: #include "AMDGPUGenRegisterBank.inc" }; class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; - - void executeInWaterfallLoop(MachineInstr &MI, + const SIInstrInfo *TII; + + bool collectWaterfallOperands( + SmallSet<Register, 4> &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + + bool executeInWaterfallLoop(MachineIRBuilder &B, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef<unsigned> OpIndices) const; + bool executeInWaterfallLoop(MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const; @@ -47,6 +69,19 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { bool applyMappingWideLoad(MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const; + bool + applyMappingImage(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo 
&MRI, int RSrcIdx) const; + + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Reg) const; + + std::pair<Register, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register Offset) const; + + MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B, + MachineInstr &MI) const; /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; @@ -58,6 +93,16 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const TargetRegisterInfo &TRI, unsigned Default = AMDGPU::VGPRRegBankID) const; + // Return a value mapping for an operand that is required to be an SGPR. + const ValueMapping *getSGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + + // Return a value mapping for an operand that is required to be a VGPR. + const ValueMapping *getVGPROpMapping(Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) const; + /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p /// Regs. This appropriately sets the regbank of the new registers. void split64BitValueForMapping(MachineIRBuilder &B, @@ -90,8 +135,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingAllVGPR( const MachineInstr &MI) const; + + const InstructionMapping &getImageMapping(const MachineRegisterInfo &MRI, + const MachineInstr &MI, + int RsrcIdx) const; + public: - AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + AMDGPURegisterBankInfo(const GCNSubtarget &STI); unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 9555694fb106..00f53b157577 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,14 +7,14 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024] >; def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>; // It is helpful to distinguish conditions from ordinary SGPRs. -def VCCRegBank : RegisterBank <"VCC", [SReg_64]>; +def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 7cffdf1a4dcf..9806e6b0714f 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. 
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, - AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, - AMDGPU::sub30, AMDGPU::sub31 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; +// Table of NumRegs sized pieces at every 32-bit offset. +static const uint16_t SubRegFromChannelTable[][32] = { + { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31 + }, + { + AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4, + AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, + AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, + AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16, + AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, + AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, + AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28, + AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, + AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, + AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, + AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, + AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, + AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, + AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, + AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, + AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, + AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, + AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, + AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, + 
AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, + AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, + AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + } +}; + +// FIXME: TableGen should generate something to make this manageable for all +// register classes. At a minimum we could use the opposite of +// composeSubRegIndices and go up from the base 32-bit subreg. +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) { + const unsigned NumRegIndex = NumRegs - 1; + + assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && + "Not implemented"); + assert(Channel < array_lengthof(SubRegFromChannelTable[0])); + return SubRegFromChannelTable[NumRegIndex][Channel]; } void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 3453a8c1b0b3..9e713ca804a1 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel); + static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index f8703c36127a..26b8b7840270 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -81,6 +81,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; @@ -92,6 +94,8 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1eb9b83456c5..3bb6dd4571c0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasFminFmaxLegacy(true), EnablePromoteAlloca(false), HasTrigReducedRange(false), + MaxWavesPerEU(10), LocalMemorySize(0), WavefrontSize(0) { } @@ -261,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 
AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), + HasMFMAInlineLiteralBug(false), ScalarizeGlobal(false), @@ -278,9 +280,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } @@ -489,28 +492,28 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL); const DataLayout &DL = F.getParent()->getDataLayout(); uint64_t ExplicitArgBytes = 0; - MaxAlign = 1; + MaxAlign = Align::None(); for (const Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(ArgTy); + const Align Alignment(DL.getABITypeAlignment(ArgTy)); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); - ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; - MaxAlign = std::max(MaxAlign, Align); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; + MaxAlign = std::max(MaxAlign, Alignment); } return ExplicitArgBytes; } unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const { + Align &MaxAlign) const { uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); unsigned ExplicitOffset = getExplicitKernelArgOffset(F); @@ -518,7 +521,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); if (ImplicitBytes != 0) { - unsigned Alignment = getAlignmentForImplicitArgPtr(); + const Align Alignment = getAlignmentForImplicitArgPtr(); TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; } @@ -566,7 +569,7 @@ bool GCNSubtarget::hasMadF16() const { unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 10; + return getMaxWavesPerEU(); if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) @@ -591,25 +594,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { } unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { - if (VGPRs <= 24) - return 10; - if (VGPRs <= 28) - return 9; - if (VGPRs <= 32) - return 8; - if (VGPRs <= 36) - return 7; - if (VGPRs <= 40) - return 6; - if (VGPRs <= 48) - return 5; - if (VGPRs <= 64) - return 4; - if (VGPRs <= 84) - return 3; - if (VGPRs <= 128) - return 2; - return 1; + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Granule = getVGPRAllocGranule(); + if (VGPRs < Granule) + return MaxWaves; + unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule; + return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves); } unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { @@ -629,6 +619,20 @@ unsigned 
GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } +unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, + unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), + getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -878,8 +882,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); - Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo)); + Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo)); + Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 78c3b823946d..936feb00c62b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ protected: bool HasFminFmaxLegacy; bool EnablePromoteAlloca; bool HasTrigReducedRange; + unsigned MaxWavesPerEU; int LocalMemorySize; unsigned WavefrontSize; @@ -195,8 +196,8 @@ public: return LocalMemorySize; } - unsigned getAlignmentForImplicitArgPtr() const { - return isAmdHsaOS() ? 8 : 4; + Align getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? Align(8) : Align(4); } /// Returns the offset in bytes from the start of the input buffer @@ -223,7 +224,9 @@ public: /// subtarget. virtual unsigned getMinWavesPerEU() const = 0; - unsigned getMaxWavesPerEU() const { return 10; } + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } /// Creates value range metadata on an workitemid.* inrinsic call or load. 
bool makeLIDRangeMetadata(Instruction *I) const; @@ -235,16 +238,17 @@ public: return 16; return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); } - uint64_t getExplicitKernArgSize(const Function &F, - unsigned &MaxAlign) const; - unsigned getKernArgSegmentSize(const Function &F, - unsigned &MaxAlign) const; + uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; virtual ~AMDGPUSubtarget() {} }; class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { + + using AMDGPUSubtarget::getMaxWavesPerEU; + public: enum TrapHandlerAbi { TrapHandlerAbiNone = 0, @@ -362,6 +366,7 @@ protected: bool CaymanISA; bool CFALUBug; bool LDSMisalignedBug; + bool HasMFMAInlineLiteralBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; @@ -416,7 +421,7 @@ public: return CallLoweringInfo.get(); } - const InstructionSelector *getInstructionSelector() const override { + InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } @@ -544,6 +549,14 @@ public: return GFX9Insts; } + bool hasScalarPackInsts() const { + return GFX9Insts; + } + + bool hasScalarMulHiInsts() const { + return GFX9Insts; + } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } @@ -611,6 +624,11 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns If target supports S_DENORM_MODE. + bool hasDenormModeInst() const { + return getGeneration() >= AMDGPUSubtarget::GFX10; + } + bool useFlatForGlobal() const { return FlatForGlobal; } @@ -848,9 +866,7 @@ public: // on the pointer value itself may rely on the alignment / known low bits of // the pointer. Set this to something above the minimum to avoid needing // dynamic realignment in common cases. - unsigned getStackAlignment() const { - return 16; - } + Align getStackAlignment() const { return Align(16); } bool enableMachineScheduler() const override { return true; @@ -881,12 +897,6 @@ public: return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget without any kind of limitation. - unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(); - } - /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { @@ -944,6 +954,14 @@ public: return HasDPP; } + bool hasDPPBroadcasts() const { + return HasDPP && getGeneration() < GFX10; + } + + bool hasDPPWavefrontShifts() const { + return HasDPP && getGeneration() < GFX10; + } + bool hasDPP8() const { return HasDPP8; } @@ -974,6 +992,10 @@ public: return SGPRInitBug; } + bool hasMFMAInlineLiteralBug() const { + return HasMFMAInlineLiteralBug; + } + bool has12DWordStoreHazard() const { return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } @@ -1036,6 +1058,13 @@ public: /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; + /// Return occupancy for the given function. Used LDS and a number of + /// registers if provided. + /// Note, occupancy can be affected by the scratch allocation as well, but + /// we do not have enough information to compute it. 
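computeOccupancy and the reworked getOccupancyWithNumVGPRs replace the old hard-coded VGPR table with granule-based arithmetic: round the VGPR count up to the allocation granule, divide the SIMD's total VGPR budget by the rounded count, and clamp the result to [1, MaxWavesPerEU]. The standalone sketch below shows only that arithmetic; the granule (4), total VGPR count (256) and wave limit (10) are assumed example values, not numbers queried from a real subtarget.

#include <algorithm>
#include <cstdio>

// Sketch of the granule-based occupancy arithmetic with assumed parameters.
static unsigned occupancyForVGPRs(unsigned VGPRs, unsigned Granule = 4,
                                  unsigned TotalVGPRs = 256,
                                  unsigned MaxWaves = 10) {
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned Rounded = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(TotalVGPRs / Rounded, 1u), MaxWaves);
}

int main() {
  // 24 VGPRs -> 10 waves, 70 VGPRs -> 256/72 = 3 waves, 200 VGPRs -> 1 wave.
  std::printf("%u %u %u\n", occupancyForVGPRs(24), occupancyForVGPRs(70),
              occupancyForVGPRs(200));
}

For example, 70 live VGPRs round up to 72, giving 256 / 72 = 3 waves per execution unit before any SGPR or LDS limit is applied.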
+ unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { @@ -1226,9 +1255,7 @@ public: return Gen; } - unsigned getStackAlignment() const { - return 4; - } + Align getStackAlignment() const { return Align(4); } R600Subtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0ea8db04c298..e8cf77161a14 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -238,16 +238,17 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - return llvm::make_unique<AMDGPUTargetObjectFile>(); + return std::make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>()); + return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>()); } static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { @@ -257,7 +258,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); + new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); @@ -412,6 +413,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } PM.add(createAMDGPUUnifyMetadataPass()); + PM.add(createAMDGPUPrintfRuntimeBinding()); PM.add(createAMDGPUPropagateAttributesLatePass(this)); if (Internalize) { PM.add(createInternalizePass(mustPreserveGV)); @@ -482,7 +484,7 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); + I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); } return I.get(); @@ -518,7 +520,7 @@ const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); + I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -659,6 +661,8 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPUPrintfRuntimeBinding()); + // This must occur before inlining, as the inliner will not look through // bitcast calls. addPass(createAMDGPUFixFunctionBitcastsPass()); @@ -681,12 +685,6 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - // TODO: May want to move later or split into an early and late one. - - addPass(createAMDGPUCodeGenPreparePass()); - } - // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. if (TM.getTargetTriple().getArch() == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); @@ -714,6 +712,11 @@ void AMDGPUPassConfig::addIRPasses() { } } + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass()); + } + TargetPassConfig::addIRPasses(); // EarlyCSE is not always strong enough to clean up what LSR produces. For @@ -1046,7 +1049,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return true; if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && - !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) { + !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } @@ -1095,7 +1098,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (YamlMFI.ArgInfo && (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, - AMDGPU::SReg_128RegClass, + AMDGPU::SGPR_128RegClass, MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index aaed280a1270..616196ad5ba3 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -57,7 +57,7 @@ using namespace llvm; static cl::opt<unsigned> UnrollThresholdPrivate( "amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), - cl::init(2500), cl::Hidden); + cl::init(2000), cl::Hidden); static cl::opt<unsigned> UnrollThresholdLocal( "amdgpu-unroll-threshold-local", @@ -590,6 +590,61 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { return false; } +bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, + Intrinsic::ID IID) const { + switch (IID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: + OpIndexes.push_back(0); + return true; + default: + return false; + } +} + +bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( + IntrinsicInst *II, Value *OldV, Value *NewV) const { + auto IntrID = II->getIntrinsicID(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { + const 
ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); + if (!IsVolatile->isZero()) + return false; + Module *M = II->getParent()->getParent()->getParent(); + Type *DestTy = II->getType(); + Type *SrcTy = NewV->getType(); + Function *NewDecl = + Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); + II->setArgOperand(0, NewV); + II->setCalledFunction(NewDecl); + return true; + } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = NewV->getType()->getContext(); + ConstantInt *NewVal = (TrueAS == NewAS) ? + ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); + II->replaceAllUsesWith(NewVal); + II->eraseFromParent(); + return true; + } + default: + return false; + } +} + unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { @@ -638,6 +693,39 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, CommonTTI.getUnrollingPreferences(L, SE, UP); } +unsigned GCNTTIImpl::getUserCost(const User *U, + ArrayRef<const Value *> Operands) { + // Estimate extractelement elimination + if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) { + ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(), + Idx); + } + + // Estimate insertelement elimination + if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) { + ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); + unsigned Idx = -1; + if (CI) + Idx = CI->getZExtValue(); + return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx); + } + + // Estimate different intrinsics, e.g. llvm.fabs + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { + SmallVector<Value *, 4> Args(II->arg_operands()); + FastMathFlags FMF; + if (auto *FPMO = dyn_cast<FPMathOperator>(II)) + FMF = FPMO->getFastMathFlags(); + return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, + FMF); + } + return BaseT::getUserCost(U, Operands); +} + unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 
} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 6f1bf5a26f0d..67f7f9074f10 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -46,10 +46,18 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { Triple TargetTriple; + const TargetSubtargetInfo *ST; + const TargetLoweringBase *TLI; + + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLoweringBase *getTLI() const { return TLI; } + public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) - : BaseT(TM, F.getParent()->getDataLayout()), - TargetTriple(TM->getTargetTriple()) {} + : BaseT(TM, F.getParent()->getDataLayout()), + TargetTriple(TM->getTargetTriple()), + ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), + TLI(ST->getTargetLowering()) {} void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); @@ -183,6 +191,11 @@ public: return AMDGPUAS::FLAT_ADDRESS; } + bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, + Intrinsic::ID IID) const; + bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, Value *NewV) const; + unsigned getVectorSplitCost() { return 0; } unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, @@ -191,7 +204,7 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 7; } + unsigned getInliningThresholdMultiplier() { return 9; } int getInlinerVectorBonusPercent() { return 0; } @@ -201,6 +214,7 @@ public: int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned); + unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); }; class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 12f2e9519c9e..101ecfc0c87c 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1307,8 +1307,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); @@ -1316,8 +1316,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // XXX: We are running this after RA, so creating virtual registers will // cause an assertion failure in the PostRA scheduling pass. 
- unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + Register InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6d678966c98e..9dd511fab57c 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ public: ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -216,14 +217,15 @@ public: if (Kind == Token) return true; - if (Kind != Expression || !Expr) - return false; - // When parsing operands, we can't always tell if something was meant to be // a token, like 'gds', or an expression that references a global variable. // In this case, we assume the string is an expression, and if we need to // interpret is a token, then we treat the symbol name as the token. - return isa<MCSymbolRefExpr>(Expr); + return isSymbolRefExpr(); + } + + bool isSymbolRefExpr() const { + return isExpr() && Expr && isa<MCSymbolRefExpr>(Expr); } bool isImm() const override { @@ -274,8 +276,10 @@ public: isRegClass(AMDGPU::VReg_64RegClassID) || isRegClass(AMDGPU::VReg_96RegClassID) || isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_160RegClassID) || isRegClass(AMDGPU::VReg_256RegClassID) || - isRegClass(AMDGPU::VReg_512RegClassID); + isRegClass(AMDGPU::VReg_512RegClassID) || + isRegClass(AMDGPU::VReg_1024RegClassID); } bool isVReg32() const { @@ -286,6 +290,10 @@ public: return isOff() || isVReg32(); } + bool isNull() const { + return isRegKind() && getReg() == AMDGPU::SGPR_NULL; + } + bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; bool isSDWAFP32Operand() const; @@ -325,6 +333,7 @@ public: bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -817,6 +826,7 @@ public: case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -886,7 +896,7 @@ public: int64_t Val, SMLoc Loc, ImmTy Type = ImmTyNone, bool IsFPImm = false) { - auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Immediate, AsmParser); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; @@ -899,7 +909,7 @@ public: static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser, StringRef Str, SMLoc Loc, bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser); + auto Res = std::make_unique<AMDGPUOperand>(Token, AsmParser); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); Res->StartLoc = Loc; @@ -910,7 +920,7 @@ public: static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser, unsigned RegNo, SMLoc S, SMLoc E) { - auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Register, AsmParser); Op->Reg.RegNo = RegNo; Op->Reg.Mods = Modifiers(); Op->StartLoc = 
S; @@ -920,7 +930,7 @@ public: static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser, const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser); + auto Op = std::make_unique<AMDGPUOperand>(Expression, AsmParser); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; @@ -1051,11 +1061,23 @@ private: std::string &CollectString); bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum); + RegisterKind RegKind, unsigned Reg1); bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, - unsigned& RegNum, unsigned& RegWidth, - unsigned *DwordRegIndex); + unsigned& RegNum, unsigned& RegWidth); + unsigned ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + unsigned ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth); + bool ParseRegRange(unsigned& Num, unsigned& Width); + unsigned getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth); + bool isRegister(); bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const; Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); @@ -1306,6 +1328,7 @@ private: bool validateOpSel(const MCInst &Inst); bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; + unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1321,6 +1344,7 @@ private: void peekTokens(MutableArrayRef<AsmToken> Tokens); AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); + bool parseExpr(OperandVector &Operands); StringRef getTokenStr() const; AsmToken peekToken(); AsmToken getToken() const; @@ -1399,9 +1423,12 @@ public: void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc = false); + uint64_t BasicInstType, + bool SkipDstVcc = false, + bool SkipSrcVcc = false); AMDGPUOperand::Ptr defaultBLGP() const; AMDGPUOperand::Ptr defaultCBSZ() const; @@ -1636,8 +1663,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { } bool AMDGPUOperand::isBoolReg() const { - return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? 
- isSCSrcB64() : isSCSrcB32(); + return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) || + (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32()); } uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const @@ -1849,6 +1876,8 @@ static bool isInlineValue(unsigned Reg) { case AMDGPU::SRC_EXECZ: case AMDGPU::SRC_SCC: return true; + case AMDGPU::SGPR_NULL: + return true; default: return false; } @@ -1870,8 +1899,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; case 4: return AMDGPU::VReg_128RegClassID; + case 5: return AMDGPU::VReg_160RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; + case 32: return AMDGPU::VReg_1024RegClassID; } } else if (Is == IS_TTMP) { switch (RegWidth) { @@ -1944,7 +1975,7 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) .Case("null", AMDGPU::SGPR_NULL) - .Default(0); + .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, @@ -1959,8 +1990,7 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, } bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, - RegisterKind RegKind, unsigned Reg1, - unsigned RegNum) { + RegisterKind RegKind, unsigned Reg1) { switch (RegKind) { case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { @@ -2008,14 +2038,37 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, } } -static const StringRef Registers[] = { - { "v" }, - { "s" }, - { "ttmp" }, - { "acc" }, - { "a" }, +struct RegInfo { + StringLiteral Name; + RegisterKind Kind; +}; + +static constexpr RegInfo RegularRegisters[] = { + {{"v"}, IS_VGPR}, + {{"s"}, IS_SGPR}, + {{"ttmp"}, IS_TTMP}, + {{"acc"}, IS_AGPR}, + {{"a"}, IS_AGPR}, }; +static bool isRegularReg(RegisterKind Kind) { + return Kind == IS_VGPR || + Kind == IS_SGPR || + Kind == IS_TTMP || + Kind == IS_AGPR; +} + +static const RegInfo* getRegularRegInfo(StringRef Str) { + for (const RegInfo &Reg : RegularRegisters) + if (Str.startswith(Reg.Name)) + return &Reg; + return nullptr; +} + +static bool getRegNum(StringRef Str, unsigned& Num) { + return !Str.getAsInteger(10, Num); +} + bool AMDGPUAsmParser::isRegister(const AsmToken &Token, const AsmToken &NextToken) const { @@ -2029,24 +2082,24 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token, // A single register like s0 or a range of registers like s[0:1] - StringRef RegName = Token.getString(); - - for (StringRef Reg : Registers) { - if (RegName.startswith(Reg)) { - if (Reg.size() < RegName.size()) { - unsigned RegNum; - // A single register with an index: rXX - if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum)) - return true; - } else { - // A range of registers: r[XX:YY]. - if (NextToken.is(AsmToken::LBrac)) - return true; - } + StringRef Str = Token.getString(); + const RegInfo *Reg = getRegularRegInfo(Str); + if (Reg) { + StringRef RegName = Reg->Name; + StringRef RegSuffix = Str.substr(RegName.size()); + if (!RegSuffix.empty()) { + unsigned Num; + // A single register with an index: rXX + if (getRegNum(RegSuffix, Num)) + return true; + } else { + // A range of registers: r[XX:YY]. 
+ if (NextToken.is(AsmToken::LBrac)) + return true; } } - return getSpecialRegForName(RegName); + return getSpecialRegForName(Str) != AMDGPU::NoRegister; } bool @@ -2055,137 +2108,161 @@ AMDGPUAsmParser::isRegister() return isRegister(getToken(), peekToken()); } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, - unsigned &RegNum, unsigned &RegWidth, - unsigned *DwordRegIndex) { - if (DwordRegIndex) { *DwordRegIndex = 0; } +unsigned +AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, + unsigned RegNum, + unsigned RegWidth) { + + assert(isRegularReg(RegKind)); + + unsigned AlignSize = 1; + if (RegKind == IS_SGPR || RegKind == IS_TTMP) { + // SGPR and TTMP registers must be aligned. + // Max required alignment is 4 dwords. + AlignSize = std::min(RegWidth, 4u); + } + + if (RegNum % AlignSize != 0) + return AMDGPU::NoRegister; + + unsigned RegIdx = RegNum / AlignSize; + int RCID = getRegClass(RegKind, RegWidth); + if (RCID == -1) + return AMDGPU::NoRegister; + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (getLexer().is(AsmToken::Identifier)) { - StringRef RegName = Parser.getTok().getString(); - if ((Reg = getSpecialRegForName(RegName))) { - Parser.Lex(); - RegKind = IS_SPECIAL; - } else { - unsigned RegNumIndex = 0; - if (RegName[0] == 'v') { - RegNumIndex = 1; - RegKind = IS_VGPR; - } else if (RegName[0] == 's') { - RegNumIndex = 1; - RegKind = IS_SGPR; - } else if (RegName[0] == 'a') { - RegNumIndex = RegName.startswith("acc") ? 3 : 1; - RegKind = IS_AGPR; - } else if (RegName.startswith("ttmp")) { - RegNumIndex = strlen("ttmp"); - RegKind = IS_TTMP; - } else { - return false; - } - if (RegName.size() > RegNumIndex) { - // Single 32-bit register: vXX. - if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) - return false; - Parser.Lex(); - RegWidth = 1; - } else { - // Range of registers: v[XX:YY]. ":YY" is optional. 
- Parser.Lex(); - int64_t RegLo, RegHi; - if (getLexer().isNot(AsmToken::LBrac)) - return false; - Parser.Lex(); + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIdx >= RC.getNumRegs()) + return AMDGPU::NoRegister; - if (getParser().parseAbsoluteExpression(RegLo)) - return false; + return RC.getRegister(RegIdx); +} - const bool isRBrace = getLexer().is(AsmToken::RBrac); - if (!isRBrace && getLexer().isNot(AsmToken::Colon)) - return false; - Parser.Lex(); +bool +AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { + int64_t RegLo, RegHi; + if (!trySkipToken(AsmToken::LBrac)) + return false; - if (isRBrace) { - RegHi = RegLo; - } else { - if (getParser().parseAbsoluteExpression(RegHi)) - return false; + if (!parseExpr(RegLo)) + return false; - if (getLexer().isNot(AsmToken::RBrac)) - return false; - Parser.Lex(); - } - RegNum = (unsigned) RegLo; - RegWidth = (RegHi - RegLo) + 1; - } - } - } else if (getLexer().is(AsmToken::LBrac)) { - // List of consecutive registers: [s0,s1,s2,s3] - Parser.Lex(); - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr)) - return false; - if (RegWidth != 1) + if (trySkipToken(AsmToken::Colon)) { + if (!parseExpr(RegHi)) return false; - RegisterKind RegKind1; - unsigned Reg1, RegNum1, RegWidth1; - do { - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); - } else if (getLexer().is(AsmToken::RBrac)) { - Parser.Lex(); - break; - } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) { - if (RegWidth1 != 1) { - return false; - } - if (RegKind1 != RegKind) { - return false; - } - if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) { - return false; - } - } else { - return false; - } - } while (true); } else { - return false; + RegHi = RegLo; } - switch (RegKind) { - case IS_SPECIAL: + + if (!trySkipToken(AsmToken::RBrac)) + return false; + + if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi) + return false; + + Num = static_cast<unsigned>(RegLo); + Width = (RegHi - RegLo) + 1; + return true; +} + +unsigned +AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + unsigned Reg = getSpecialRegForName(getTokenStr()); + if (Reg) { RegNum = 0; RegWidth = 1; - break; - case IS_VGPR: - case IS_SGPR: - case IS_AGPR: - case IS_TTMP: - { - unsigned Size = 1; - if (RegKind == IS_SGPR || RegKind == IS_TTMP) { - // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords. - Size = std::min(RegWidth, 4u); - } - if (RegNum % Size != 0) - return false; - if (DwordRegIndex) { *DwordRegIndex = RegNum; } - RegNum = RegNum / Size; - int RCID = getRegClass(RegKind, RegWidth); - if (RCID == -1) - return false; - const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegNum >= RC.getNumRegs()) - return false; - Reg = RC.getRegister(RegNum); - break; + RegKind = IS_SPECIAL; + lex(); // skip register name + } + return Reg; +} + +unsigned +AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + assert(isToken(AsmToken::Identifier)); + StringRef RegName = getTokenStr(); + + const RegInfo *RI = getRegularRegInfo(RegName); + if (!RI) + return AMDGPU::NoRegister; + lex(); // skip register name + + RegKind = RI->Kind; + StringRef RegSuffix = RegName.substr(RI->Name.size()); + if (!RegSuffix.empty()) { + // Single 32-bit register: vXX. 
+ if (!getRegNum(RegSuffix, RegNum)) + return AMDGPU::NoRegister; + RegWidth = 1; + } else { + // Range of registers: v[XX:YY]. ":YY" is optional. + if (!ParseRegRange(RegNum, RegWidth)) + return AMDGPU::NoRegister; } - default: - llvm_unreachable("unexpected register kind"); + return getRegularReg(RegKind, RegNum, RegWidth); +} + +unsigned +AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, + unsigned &RegNum, + unsigned &RegWidth) { + unsigned Reg = AMDGPU::NoRegister; + + if (!trySkipToken(AsmToken::LBrac)) + return AMDGPU::NoRegister; + + // List of consecutive registers, e.g.: [s0,s1,s2,s3] + + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + return AMDGPU::NoRegister; + if (RegWidth != 1) + return AMDGPU::NoRegister; + + for (; trySkipToken(AsmToken::Comma); ) { + RegisterKind NextRegKind; + unsigned NextReg, NextRegNum, NextRegWidth; + + if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth)) + return AMDGPU::NoRegister; + if (NextRegWidth != 1) + return AMDGPU::NoRegister; + if (NextRegKind != RegKind) + return AMDGPU::NoRegister; + if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg)) + return AMDGPU::NoRegister; } - if (!subtargetHasRegister(*TRI, Reg)) - return false; - return true; + if (!trySkipToken(AsmToken::RBrac)) + return AMDGPU::NoRegister; + + if (isRegularReg(RegKind)) + Reg = getRegularReg(RegKind, RegNum, RegWidth); + + return Reg; +} + +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, + unsigned &Reg, + unsigned &RegNum, + unsigned &RegWidth) { + Reg = AMDGPU::NoRegister; + + if (isToken(AsmToken::Identifier)) { + Reg = ParseSpecialReg(RegKind, RegNum, RegWidth); + if (Reg == AMDGPU::NoRegister) + Reg = ParseRegularReg(RegKind, RegNum, RegWidth); + } else { + Reg = ParseRegList(RegKind, RegNum, RegWidth); + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg); } Optional<StringRef> @@ -2241,18 +2318,18 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); RegisterKind RegKind; - unsigned Reg, RegNum, RegWidth, DwordRegIndex; + unsigned Reg, RegNum, RegWidth; - if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { //FIXME: improve error messages (bug 41303). 
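
(A minimal standalone sketch of the alignment rule enforced by getRegularReg above: SGPR and TTMP ranges must start on a boundary of min(width, 4) dwords, and the index into the register class is the starting dword divided by that alignment. The regClassIndex helper and RegKind enum below are illustrative names only, not parser APIs.)

#include <algorithm>
#include <cstdio>
#include <optional>

// Simplified analogue of the getRegularReg computation: given a register
// kind, a starting dword number and a width in dwords, return the index of
// the register inside the matching register class, or nullopt if the range
// is mis-aligned. SGPR/TTMP ranges must be aligned; VGPR/AGPR need not be.
enum class RegKind { VGPR, SGPR, TTMP, AGPR };

std::optional<unsigned> regClassIndex(RegKind Kind, unsigned RegNum,
                                      unsigned RegWidth) {
  unsigned AlignSize = 1;
  if (Kind == RegKind::SGPR || Kind == RegKind::TTMP)
    AlignSize = std::min(RegWidth, 4u);   // max required alignment is 4 dwords
  if (RegNum % AlignSize != 0)
    return std::nullopt;                  // e.g. s[3:6] is rejected
  return RegNum / AlignSize;              // index into the aligned class
}

int main() {
  // s[4:7]: width 4, start 4 -> index 1 in the 128-bit SGPR class.
  if (auto Idx = regClassIndex(RegKind::SGPR, 4, 4))
    std::printf("s[4:7] -> class index %u\n", *Idx);
  // s[3:6] is not 4-dword aligned and is diagnosed as an invalid operand.
  if (!regClassIndex(RegKind::SGPR, 3, 4))
    std::printf("s[3:6] rejected (not 4-dword aligned)\n");
}
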
Error(StartLoc, "not a valid operand."); return nullptr; } if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { - if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth)) + if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else - KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + KernelScope.usesRegister(RegKind, RegNum, RegWidth); return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } @@ -2648,7 +2725,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { case AMDGPU::VCC_LO: case AMDGPU::VCC_HI: case AMDGPU::M0: - case AMDGPU::SGPR_NULL: return Reg; default: break; @@ -2697,13 +2773,38 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, } } +unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { + if (!isGFX10()) + return 1; + + switch (Opcode) { + // 64-bit shift instructions can use only one scalar value input + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + default: + return 2; + } +} + bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { const MCOperand &MO = Inst.getOperand(OpIdx); if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); + } else if (MO.isReg()) { + auto Reg = MO.getReg(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL; + } else { + return true; } - return !MO.isReg() || - isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { @@ -2782,10 +2883,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { } ConstantBusUseCount += NumLiterals; - if (isGFX10()) - return ConstantBusUseCount <= 2; - - return ConstantBusUseCount <= 1; + return ConstantBusUseCount <= getConstantBusLimit(Opcode); } bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { @@ -3212,6 +3310,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; uint32_t LiteralValue; @@ -3219,19 +3318,21 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (MO.isImm() && - // Exclude special imm operands (like that used by s_set_gpr_idx_on) - AMDGPU::isSISrcOperand(Desc, OpIdx) && - !isInlineConstant(Inst, OpIdx)) { - uint32_t Value = static_cast<uint32_t>(MO.getImm()); - if (NumLiterals == 0 || LiteralValue != Value) { - LiteralValue = Value; - ++NumLiterals; + // Exclude special imm operands (like that used by s_set_gpr_idx_on) + if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { + uint32_t Value = static_cast<uint32_t>(MO.getImm()); + if (NumLiterals == 0 || LiteralValue != Value) { + LiteralValue = Value; + ++NumLiterals; + } + } else if (MO.isExpr()) { + ++NumExprs; } } } - return NumLiterals <= 1; + return NumLiterals + NumExprs <= 1; } bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { @@ -3267,6 +3368,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + unsigned NumExprs = 0; unsigned NumLiterals = 0; 
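
(getConstantBusLimit above allows two scalar sources per VALU instruction on GFX10, except for the 64-bit shifts, which keep the single-slot limit of earlier targets; validateConstantBusLimitations then compares the number of distinct SGPRs plus literal values against that budget. The sketch below is a simplified model of that accounting, under the assumption that each distinct SGPR and each distinct literal value occupies one slot; fitsConstantBus and the Src struct are hypothetical names, not parser types.)

#include <cstdio>
#include <set>
#include <vector>

// Hypothetical operand model: a source is either an SGPR or a 32-bit literal.
struct Src { bool IsSGPR; unsigned SGPRNum; unsigned Literal; };

// Count distinct constant-bus users: every distinct SGPR costs one slot,
// and every distinct literal value costs one more.
bool fitsConstantBus(const std::vector<Src> &Srcs, unsigned Limit) {
  std::set<unsigned> SGPRs, Literals;
  for (const Src &S : Srcs)
    (S.IsSGPR ? SGPRs : Literals).insert(S.IsSGPR ? S.SGPRNum : S.Literal);
  return SGPRs.size() + Literals.size() <= Limit;
}

int main() {
  // A VOP3 op reading s0 and s1 uses two constant-bus slots:
  // accepted with the GFX10 limit of 2, rejected with the older limit of 1.
  std::vector<Src> TwoSGPRs = {{true, 0, 0}, {true, 1, 0}};
  std::printf("limit 2: %d, limit 1: %d\n",
              fitsConstantBus(TwoSGPRs, 2), fitsConstantBus(TwoSGPRs, 1));
}
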
uint32_t LiteralValue; @@ -3274,17 +3376,26 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const { if (OpIdx == -1) break; const MCOperand &MO = Inst.getOperand(OpIdx); - if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx)) + if (!MO.isImm() && !MO.isExpr()) + continue; + if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) continue; - if (!isInlineConstant(Inst, OpIdx)) { + if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) && + getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug]) + return false; + + if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { uint32_t Value = static_cast<uint32_t>(MO.getImm()); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; } + } else if (MO.isExpr()) { + ++NumExprs; } } + NumLiterals += NumExprs; return !NumLiterals || (NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]); @@ -3607,37 +3718,44 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); - UserSGPRCount += 4; + if (Val) + UserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); - UserSGPRCount += 2; + if (Val) + UserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); - UserSGPRCount += 1; + if (Val) + UserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { if (IVersion.Major < 10) return getParser().Error(IDRange.Start, "directive requires gfx10+", @@ -5225,6 +5343,23 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) { } bool +AMDGPUAsmParser::parseExpr(OperandVector &Operands) { + SMLoc S = getLoc(); + + const MCExpr *Expr; + if (Parser.parseExpression(Expr)) + return false; + + int64_t IntVal; + if (Expr->evaluateAsAbsolute(IntVal)) { + Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + } else { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + } + return true; +} + +bool AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { if (isToken(AsmToken::String)) { Val = getToken().getStringContents(); @@ -5605,25 +5740,29 @@ bool AMDGPUOperand::isGPRIdxMode() const { OperandMatchResultTy AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) 
- return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S)); - return MatchOperand_Success; - } + // Make sure we are not parsing something + // that looks like a label or an expression but is not. + // This will improve error messages. + if (isRegister() || isModifier()) + return MatchOperand_NoMatch; - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr(this, - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; + if (parseExpr(Operands)) { + + AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + assert(Opr.isImm() || Opr.isExpr()); + SMLoc Loc = Opr.getStartLoc(); + + // Currently we do not support arbitrary expressions as branch targets. + // Only labels and absolute expressions are accepted. + if (Opr.isExpr() && !Opr.isSymbolRefExpr()) { + Error(Loc, "expected an absolute expression or a label"); + } else if (Opr.isImm() && !Opr.isS16Imm()) { + Error(Loc, "expected a 16-bit signed jump offset"); + } } + + return MatchOperand_Success; // avoid excessive error messages } //===----------------------------------------------------------------------===// @@ -5908,6 +6047,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, @@ -5941,8 +6081,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { - unsigned size = Operands.size(); - assert(size > 0); OperandMatchResultTy res = parseOptionalOpr(Operands); @@ -5957,17 +6095,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan // to make sure autogenerated parser of custom operands never hit hardcoded // mandatory operands. - if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { - - // We have parsed the first optional operand. - // Parse as many operands as necessary to skip all mandatory operands. 
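
(parseSOppBrTarget above now accepts either a label or an absolute expression and reports "expected a 16-bit signed jump offset" when an immediate does not fit the encodable range checked by isS16Imm. A trivial sketch of that range check with a hypothetical helper name.)

#include <cstdint>
#include <cstdio>

// An SOPP branch target immediate must fit in a signed 16-bit field.
bool isS16JumpOffset(int64_t Imm) {
  return Imm >= INT16_MIN && Imm <= INT16_MAX;
}

int main() {
  std::printf("%d %d %d\n",
              isS16JumpOffset(0),        // in range
              isS16JumpOffset(32767),    // largest forward offset
              isS16JumpOffset(40000));   // rejected with a diagnostic
}
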
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + isToken(AsmToken::EndOfStatement)) + break; - for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { - if (res != MatchOperand_Success || - getLexer().is(AsmToken::EndOfStatement)) break; - if (getLexer().is(AsmToken::Comma)) Parser.Lex(); - res = parseOptionalOpr(Operands); - } + trySkipToken(AsmToken::Comma); + res = parseOptionalOpr(Operands); } return res; @@ -6682,7 +6816,11 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true, true); +} + +void AMDGPUAsmParser::cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, false, true); } void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { @@ -6690,11 +6828,14 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType, bool skipVcc) { + uint64_t BasicInstType, + bool SkipDstVcc, + bool SkipSrcVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; - bool skippedVcc = false; + bool SkipVcc = SkipDstVcc || SkipSrcVcc; + bool SkippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -6704,19 +6845,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (skipVcc && !skippedVcc && Op.isReg() && + if (SkipVcc && !SkippedVcc && Op.isReg() && (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. // Skip VCC only if we didn't skip it on previous iteration. + // Note that src0 and src1 occupy 2 slots each because of modifiers. if (BasicInstType == SIInstrFlags::VOP2 && - (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { - skippedVcc = true; + ((SkipDstVcc && Inst.getNumOperands() == 1) || + (SkipSrcVcc && Inst.getNumOperands() == 5))) { + SkippedVcc = true; continue; } else if (BasicInstType == SIInstrFlags::VOPC && Inst.getNumOperands() == 0) { - skippedVcc = true; + SkippedVcc = true; continue; } } @@ -6728,7 +6871,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } - skippedVcc = false; + SkippedVcc = false; } if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 && @@ -6849,6 +6992,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; case MCK_AttrChan: return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; + case MCK_SReg_64: + case MCK_SReg_64_XEXEC: + // Null is defined as a 32-bit register but + // it should also be enabled with 64-bit operands. + // The following code enables it for SReg_64 operands + // used as source and destination. Remaining source + // operands are handled in isInlinableImm. + return Operand.isNull() ? 
Match_Success : Match_InvalidOperand; default: return Match_InvalidOperand; } diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 62a19d848af2..1b12550aed88 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; -def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">; +def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; -def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">; +def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -54,6 +54,17 @@ class MTBUFAddr64Table <bit is_addr64, string Name> { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode<string Op> { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements<string Op> { + int ret = 1; +} + + class MTBUF_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : InstSI<outs, ins, "", pattern>, @@ -67,6 +78,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + Instruction Opcode = !cast<Instruction>(NAME); + Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -90,6 +104,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real <MTBUF_Pseudo ps> : @@ -126,17 +141,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -181,51 +196,54 @@ class MTBUF_SetupAddr<int addrKind> { class MTBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + int elems, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo<opName, (outs vdataClass:$vdata), getMTBUFIns<addrKindCopy>.ret, - " $vdata, " # 
getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, - ValueType load_vt = i32, + int elems, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { - def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, - i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, - i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>, + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; } } class MTBUF_Store_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + int elems, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo <string opName, : MTBUF_Pseudo<opName, (outs), getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, - ValueType store_vt = i32, + int elems, ValueType store_vt = i32, SDPatternOperator st = null_frag> { - def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc))]>, + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, 
MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$format, i1:$glc, - i1:$slc, i1:$tfe, i1:$dlc))]>, + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; } } @@ -320,7 +339,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; - bits<4> dwords = 0; + bits<4> elements = 0; } class MUBUF_Real <MUBUF_Pseudo ps> : @@ -393,18 +412,30 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } -class getMUBUFDwords<RegisterClass regClass> { - string regClassAsInt = !cast<string>(regClass); +class getMUBUFElements<ValueType vt> { + // eq does not support ValueType for some reason. 
+ string vtAsStr = !cast<string>(vt); + int ret = - !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1, - !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2, - !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3, - !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4, - 0)))); + !if(!eq(vtAsStr, "f16"), 1, + !if(!eq(vtAsStr, "v2f16"), 2, + !if(!eq(vtAsStr, "v3f16"), 3, + !if(!eq(vtAsStr, "v4f16"), 4, + !if(!eq(vt.Size, 32), 1, + !if(!eq(vt.Size, 64), 2, + !if(!eq(vt.Size, 96), 3, + !if(!eq(vt.Size, 128), 4, 0) + ) + ) + ) + ) + ) + ) + ); } class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> { @@ -442,18 +473,18 @@ class MUBUF_SetupAddr<int addrKind> { class MUBUF_Load_Pseudo <string opName, int addrKind, - RegisterClass vdataClass, + ValueType vdata_vt, bit HasTiedDest = 0, bit isLds = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, - (outs vdataClass:$vdata), + (outs getVregSrcForVT<vdata_vt>.ret:$vdata), !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, - !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), + !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -467,19 +498,19 @@ class MUBUF_Load_Pseudo <string opName, let Uses = !if(isLds, [EXEC, M0], [EXEC]); let has_tfe = !if(isLds, 0, 1); let lds = isLds; - let dwords = getMUBUFDwords<vdataClass>.ret; + let elements = getMUBUFElements<vdata_vt>.ret; } class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -490,89 +521,87 @@ multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPa // FIXME: tfe can't be an operand because it requires a separate // opcode because it needs an N+1 register class dest register. 
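
(getMUBUFElements above derives the element count from the value type instead of the register class: packed f16 vectors count one element per component, everything else counts 32-bit dwords. A plain C++ analogue of that table; mubufElements and its parameters are illustrative names, not the TableGen class.)

#include <cstdio>
#include <string>

// Mirror of the getMUBUFElements logic: f16 vectors report one element per
// component, all other types report their size in dwords.
int mubufElements(const std::string &VT, unsigned BitSize) {
  if (VT == "f16")   return 1;
  if (VT == "v2f16") return 2;
  if (VT == "v3f16") return 3;
  if (VT == "v4f16") return 4;
  switch (BitSize) {
  case 32:  return 1;
  case 64:  return 2;
  case 96:  return 3;
  case 128: return 4;
  default:  return 0;   // unsupported width
  }
}

int main() {
  std::printf("v4f16 -> %d elements, v4f32 -> %d elements\n",
              mubufElements("v4f16", 64), mubufElements("v4f32", 128));
}
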
-multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, +multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, SDPatternOperator ld = null_frag, bit TiedDest = 0, bit isLds = 0> { - def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>, + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>, MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>, + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>, MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>; } } -multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass, - ValueType load_vt = i32, +multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, SDPatternOperator ld_nolds = null_frag, SDPatternOperator ld_lds = null_frag> { - defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>; - defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>; + defm NAME : MUBUF_Pseudo_Loads<opName, load_vt, ld_nolds>; + defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, ld_lds, 0, 1>; } class MUBUF_Store_Pseudo <string opName, int addrKind, - RegisterClass vdataClass, + ValueType store_vt, list<dag> pattern=[], // Workaround bug bz30254 - int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> + int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret, - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc", + getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret, + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; - let dwords = getMUBUFDwords<vdataClass>.ret; + let elements = getMUBUFElements<store_vt>.ret; } -multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, +multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32, SDPatternOperator st = null_frag> { 
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<0, NAME>; - def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<1, NAME>; - def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>; + def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>; + def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>; + def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>; + def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>; + def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>; } } class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), - (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), - " $srsrc, $soffset$offset lds$glc$slc"> { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), + " $srsrc, $soffset$offset lds$glc$slc$swz"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -686,7 +715,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, SDPatternOperator atomic, - bit isFP = getIsFP<vdataType>.ret> { + bit isFP = isFloatType<vdataType>.ret> { let FPAtomic = isFP in def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, MUBUFAddr64Table <0, NAME>; @@ -710,7 +739,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, SDPatternOperator atomic, - bit isFP = getIsFP<vdataType>.ret> { + bit isFP = isFloatType<vdataType>.ret> { let FPAtomic = isFP in def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set vdataType:$vdata, @@ -748,107 +777,107 @@ multiclass MUBUF_Pseudo_Atomics <string opName, //===----------------------------------------------------------------------===// defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds < - "buffer_load_format_x", VGPR_32 + "buffer_load_format_x", f32 >; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads < - "buffer_load_format_xy", VReg_64 + "buffer_load_format_xy", v2f32 >; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads < - "buffer_load_format_xyz", VReg_96 + "buffer_load_format_xyz", v3f32 >; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads < - 
"buffer_load_format_xyzw", VReg_128 + "buffer_load_format_xyzw", v4f32 >; defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores < - "buffer_store_format_x", VGPR_32 + "buffer_store_format_x", f32 >; defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores < - "buffer_store_format_xy", VReg_64 + "buffer_store_format_xy", v2f32 >; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores < - "buffer_store_format_xyz", VReg_96 + "buffer_store_format_xyz", v3f32 >; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < - "buffer_store_format_xyzw", VReg_128 + "buffer_store_format_xyzw", v4f32 >; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", VGPR_32 + "buffer_load_format_d16_x", i32 >; defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xy", VReg_64 + "buffer_load_format_d16_xy", v2i32 >; defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyz", VReg_96 + "buffer_load_format_d16_xyz", v3i32 >; defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyzw", VReg_128 + "buffer_load_format_d16_xyzw", v4i32 >; defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_x", VGPR_32 + "buffer_store_format_d16_x", i32 >; defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xy", VReg_64 + "buffer_store_format_d16_xy", v2i32 >; defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyz", VReg_96 + "buffer_store_format_d16_xyz", v3i32 >; defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyzw", VReg_128 + "buffer_store_format_d16_xyzw", v4i32 >; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_x", VGPR_32 + "buffer_load_format_d16_x", f16 >; defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xy", VGPR_32 + "buffer_load_format_d16_xy", v2f16 >; defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyz", VReg_64 + "buffer_load_format_d16_xyz", v3f16 >; defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_xyzw", VReg_64 + "buffer_load_format_d16_xyzw", v4f16 >; defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_x", VGPR_32 + "buffer_store_format_d16_x", f16 >; defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xy", VGPR_32 + "buffer_store_format_d16_xy", v2f16 >; defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyz", VReg_64 + "buffer_store_format_d16_xyz", v3f16 >; defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_xyzw", VReg_64 + "buffer_store_format_d16_xyzw", v4f16 >; } // End HasPackedD16VMem. 
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ubyte", VGPR_32, i32 + "buffer_load_ubyte", i32 >; defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sbyte", VGPR_32, i32 + "buffer_load_sbyte", i32 >; defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_ushort", VGPR_32, i32 + "buffer_load_ushort", i32 >; defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < - "buffer_load_sshort", VGPR_32, i32 + "buffer_load_sshort", i32 >; defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < - "buffer_load_dword", VGPR_32, i32 + "buffer_load_dword", i32 >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32 + "buffer_load_dwordx2", v2i32 >; defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, v3i32 + "buffer_load_dwordx3", v3i32 >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32 + "buffer_load_dwordx4", v4i32 >; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; @@ -867,111 +896,111 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; // in at least GFX8+ chips. See Bug 37653. let SubtargetPredicate = isGFX8GFX9 in { defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 + "buffer_load_dwordx2", v2i32, null_frag, 0, 1 >; defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1 + "buffer_load_dwordx3", v3i32, null_frag, 0, 1 >; defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1 + "buffer_load_dwordx4", v4i32, null_frag, 0, 1 >; } defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < - "buffer_store_byte", VGPR_32, i32, truncstorei8_global + "buffer_store_byte", i32, truncstorei8_global >; defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < - "buffer_store_short", VGPR_32, i32, truncstorei16_global + "buffer_store_short", i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < - "buffer_store_dword", VGPR_32, i32, store_global + "buffer_store_dword", i32, store_global >; defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx2", VReg_64, v2i32, store_global + "buffer_store_dwordx2", v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, v3i32, store_global + "buffer_store_dwordx3", v3i32, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx4", VReg_128, v4i32, store_global + "buffer_store_dwordx4", v4i32, store_global >; defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global + "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32 >; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag >; defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < - "buffer_atomic_add", VGPR_32, i32, atomic_add_global + "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32 >; defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global + "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32 >; defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin", VGPR_32, i32, atomic_min_global + "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32 >; defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global + "buffer_atomic_umin", VGPR_32, i32, 
atomic_load_umin_global_32 >; defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax", VGPR_32, i32, atomic_max_global + "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32 >; defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global + "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32 >; defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < - "buffer_atomic_and", VGPR_32, i32, atomic_and_global + "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32 >; defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < - "buffer_atomic_or", VGPR_32, i32, atomic_or_global + "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32 >; defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global + "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32 >; defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global + "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32 >; defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global + "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32 >; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global + "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64 >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global + "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64 >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global + "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64 >; defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global + "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64 >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global + "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64 >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global + "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64 >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global + "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64 >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global + "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64 >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global + "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64 >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global + "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64 >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global + "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64 >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global + "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 >; let SubtargetPredicate = isGFX8GFX9 in { @@ -981,58 +1010,75 @@ def BUFFER_STORE_LDS_DWORD : 
MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; let SubtargetPredicate = isGFX6 in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; -defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">; -defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">; -defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">; defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">; -defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">; -defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">; -defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">; */ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; } +let SubtargetPredicate = isGFX6GFX7GFX10 in { + +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < + "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag +>; +defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmin", VGPR_32, f32, null_frag +>; +defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmax", VGPR_32, f32, null_frag +>; +defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag +>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmin_x2", VReg_64, f64, null_frag +>; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics < + "buffer_atomic_fmax_x2", VReg_64, f64, null_frag +>; + +} + let SubtargetPredicate = HasD16LoadStore in { defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_ubyte_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_ubyte_d16_hi", i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_sbyte_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_sbyte_d16_hi", i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < - "buffer_load_short_d16", VGPR_32, i32, null_frag, 1 + "buffer_load_short_d16", i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1 + "buffer_load_short_d16_hi", i32, null_frag, 1 >; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_byte_d16_hi", VGPR_32, i32 + "buffer_store_byte_d16_hi", i32 >; defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < - "buffer_store_short_d16_hi", VGPR_32, i32 + "buffer_store_short_d16_hi", i32 >; defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_hi_x", VGPR_32 + "buffer_load_format_d16_hi_x", i32 >; defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < - "buffer_store_format_d16_hi_x", VGPR_32 + "buffer_store_format_d16_hi_x", i32 >; } // End HasD16LoadStore @@ -1043,10 +1089,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", let SubtargetPredicate = HasAtomicFaddInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global + "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret >; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global + 
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret >; } // End SubtargetPredicate = HasAtomicFaddInsts @@ -1055,35 +1101,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < // MTBUF Instructions //===----------------------------------------------------------------------===// -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; } // End 
HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; } // End HasPackedD16VMem. let SubtargetPredicate = isGFX7Plus in { @@ -1118,6 +1164,10 @@ def extract_dlc : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); }]>; +def extract_swz : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1125,33 +1175,37 @@ def extract_dlc : SDNodeXForm<imm, [{ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, timm)), 
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1182,8 +1236,12 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; @@ -1196,36 +1254,40 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, 
v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1256,8 +1318,12 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; @@ -1273,32 +1339,32 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0)), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm)), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1316,6 +1382,8 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">; defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">; defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">; defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">; +defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">; +defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">; defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">; defm : 
BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">; @@ -1326,37 +1394,39 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">; +defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">; multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -1370,8 +1440,8 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMI def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), @@ -1382,8 +1452,8 @@ def : GCNPat< def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), @@ -1394,8 +1464,8 @@ def : GCNPat< def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, 0), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), @@ -1406,8 +1476,8 @@ def : GCNPat< def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, imm:$offset, - imm:$cachepolicy, imm), + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), @@ -1419,8 +1489,8 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, 
i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, @@ -1428,12 +1498,12 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins def : GCNPat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } @@ -1454,8 +1524,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, def : GCNPat < (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; } @@ -1478,12 +1548,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1493,12 +1563,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen, ValueType vt, PatFrag ld_frag> { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; } @@ -1512,7 +1582,10 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>; + +foreach vt = Reg32Types.types in { defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; +} defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; @@ -1535,16 +1608,16 @@ defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D1 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> 
{ - // Store follows atomic op convention so address is forst + // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1558,8 +1631,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, def : GCNPat < (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; } @@ -1573,13 +1646,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1587,7 +1660,11 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>; defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>; + +foreach vt = Reg32Types.types in { +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>; +} + defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>; defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>; defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>; @@ -1613,37 +1690,41 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm)), + (vt (name 
v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0)), + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm)), + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1671,37 +1752,41 @@ let SubtargetPredicate = HasPackedD16VMem in { multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, imm), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$format, imm:$cachepolicy, 0), + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - imm:$offset, imm:$format, imm:$cachepolicy, imm), + timm:$offset, timm:$format, 
timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1957,10 +2042,9 @@ defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>; defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>; defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>; defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>; -// FIXME-GFX6-GFX7-GFX10: Add following instructions: -//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; -//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; -//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>; +defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>; +defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>; @@ -1975,10 +2059,9 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>; // FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7. -// FIXME-GFX6-GFX7-GFX10: Add following instructions: -//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; -//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; -//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; +defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; +defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; @@ -2353,7 +2436,7 @@ let SubtargetPredicate = HasPackedD16VMem in { def MUBUFInfoTable : GenericTable { let FilterClass = "MUBUF_Pseudo"; let CppTypeName = "MUBUFInfo"; - let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"]; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getMUBUFOpcodeHelper"; @@ -2364,7 +2447,26 @@ def getMUBUFInfoFromOpcode : SearchIndex { let Key = ["Opcode"]; } -def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex { +def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex { let Table = MUBUFInfoTable; - let Key = ["BaseOpcode", "dwords"]; + let Key = ["BaseOpcode", "elements"]; +} + +def MTBUFInfoTable : GenericTable { + let FilterClass = "MTBUF_Pseudo"; + let CppTypeName = "MTBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMTBUFOpcodeHelper"; +} + +def getMTBUFInfoFromOpcode : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex { + let Table = MTBUFInfoTable; + let Key = 
["BaseOpcode", "elements"]; } diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index c52eaaa3fdc5..816ec14a0e98 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -81,6 +81,17 @@ class DS_Real <DS_Pseudo ds> : // DS Pseudo instructions +class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> +: DS_Pseudo<opName, + (outs), + (ins rc:$data0, offset:$offset, gds:$gds), + "$data0$offset$gds"> { + + let has_addr = 0; + let has_data1 = 0; + let has_vdst = 0; +} + class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), @@ -317,13 +328,16 @@ class DS_GWS <string opName, dag ins, string asmOps> class DS_GWS_0D <string opName> : DS_GWS<opName, - (ins offset:$offset, gds:$gds), "$offset gds">; + (ins offset:$offset, gds:$gds), "$offset gds"> { + let hasSideEffects = 1; +} class DS_GWS_1D <string opName> : DS_GWS<opName, (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> { let has_gws_data0 = 1; + let hasSideEffects = 1; } class DS_VOID <string opName> : DS_Pseudo<opName, @@ -391,11 +405,12 @@ def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">; def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">; } +} // End has_m0_read = 0 + let SubtargetPredicate = HasDSAddTid in { -def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">; +def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } -} // End has_m0_read = 0 } // End mayLoad = 0 defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; @@ -540,13 +555,14 @@ def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">; def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">; def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; } +} // End has_m0_read = 0 let SubtargetPredicate = HasDSAddTid in { -def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">; -} -} // End has_m0_read = 0 +def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; } +} // End mayStore = 0 + def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; @@ -600,13 +616,13 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; //===----------------------------------------------------------------------===// def : GCNPat < - (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), + (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 gds)) + (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))), + (inst $ptr, offset:$offset, (i1 gds)) >; multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { @@ -621,8 +637,8 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), - (inst $ptr, (as_i16imm $offset), (i1 0), $in) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in), + (inst $ptr, offset:$offset, (i1 0), $in) >; defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">; @@ -636,13 +652,20 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">; -defm : DSReadPat_mc 
<DS_READ_B32, i32, "load_local">; + +foreach vt = Reg32Types.types in { +defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">; +} + defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">; defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">; let AddedComplexity = 100 in { -defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">; +foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">; +} + defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">; } // End AddedComplexity = 100 @@ -664,8 +687,8 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>; } class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) >; multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { @@ -681,8 +704,8 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { // Irritatingly, atomic_store reverses the order of operands from a // normal store. class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, $value, offset:$offset, (i1 0)) >; multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { @@ -699,9 +722,13 @@ defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">; -defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">; + +foreach vt = VGPR_32.RegTypes in { +defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">; +} + +defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">; +defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">; let OtherPredicates = [D16PreservesUnusedBits] in { def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>; @@ -736,46 +763,49 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>; let AddedComplexity = 100 in { -defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; +foreach vt = VReg_64.RegTypes in { +defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">; +} + defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">; } // End AddedComplexity = 100 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) >; multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - 
!cast<PatFrag>(frag#"_local")>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; } class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds)) + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds)) >; multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local")>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4ec4be9bc485..ec2e2c4e8b71 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1095,6 +1095,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 106: return createRegOperand(VCC); case 108: return createRegOperand(TBA); case 110: return createRegOperand(TMA); + case 125: return createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); @@ -1172,7 +1173,8 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { int TTmpIdx = getTTmpIdx(Val); if (TTmpIdx >= 0) { - return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); + auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32); + return createSRegOperand(TTmpClsId, TTmpIdx); } else if (Val > SGPR_MAX) { return IsWave64 ? 
decodeSpecialReg64(Val) : decodeSpecialReg32(Val); diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 0550092ce1d6..792e26d21f98 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -322,46 +322,46 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$ defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN, RAT_ATOMIC_XCHG_INT_NORET, - atomic_swap_global_ret, - atomic_swap_global_noret>; + atomic_swap_global_ret_32, + atomic_swap_global_noret_32>; defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET, - atomic_add_global_ret, atomic_add_global_noret>; + atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>; defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET, - atomic_sub_global_ret, atomic_sub_global_noret>; + atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>; defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN, RAT_ATOMIC_MIN_INT_NORET, - atomic_min_global_ret, atomic_min_global_noret>; + atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>; defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN, RAT_ATOMIC_MIN_UINT_NORET, - atomic_umin_global_ret, atomic_umin_global_noret>; + atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>; defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN, RAT_ATOMIC_MAX_INT_NORET, - atomic_max_global_ret, atomic_max_global_noret>; + atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>; defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN, RAT_ATOMIC_MAX_UINT_NORET, - atomic_umax_global_ret, atomic_umax_global_noret>; + atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>; defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET, - atomic_and_global_ret, atomic_and_global_noret>; + atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>; defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET, - atomic_or_global_ret, atomic_or_global_noret>; + atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>; defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET, - atomic_xor_global_ret, atomic_xor_global_noret>; + atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>; defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, RAT_ATOMIC_INC_UINT_NORET, - atomic_add_global_ret, - atomic_add_global_noret, 1>; + atomic_load_add_global_ret_32, + atomic_load_add_global_noret_32, 1>; defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, RAT_ATOMIC_INC_UINT_NORET, - atomic_sub_global_ret, - atomic_sub_global_noret, -1>; + atomic_load_sub_global_ret_32, + atomic_load_sub_global_noret_32, -1>; defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, RAT_ATOMIC_DEC_UINT_NORET, - atomic_add_global_ret, - atomic_add_global_noret, -1>; + atomic_load_add_global_ret_32, + atomic_load_add_global_noret_32, -1>; defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, RAT_ATOMIC_DEC_UINT_NORET, - atomic_sub_global_ret, - atomic_sub_global_noret, 1>; + atomic_load_sub_global_ret_32, + atomic_load_sub_global_noret_32, 1>; // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < @@ -628,37 +628,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", [(truncstorei16_local i32:$src1, i32:$src0)] >; def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local i32:$src0, 
i32:$src1))] + [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))] >; def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))] >; def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))] >; def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))] >; def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))] >; def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))] >; def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))] >; def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))] >; def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))] >; def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] + [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))] >; def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))] + [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))] >; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))] diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 889f60dae920..80ee17eba141 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -270,7 +270,7 @@ multiclass FLAT_Atomic_Pseudo< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = getIsFP<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), (ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc), @@ -300,7 +300,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = getIsFP<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), @@ -333,7 +333,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = getIsFP<data_vt>.ret> { + bit isFP = isFloatType<data_vt>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), @@ -564,76 +564,76 @@ defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswa v2i64, VReg_128>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap", - VGPR_32, i32, atomic_swap_global>; + VGPR_32, 
i32, atomic_swap_global_32>; defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2", - VReg_64, i64, atomic_swap_global>; + VReg_64, i64, atomic_swap_global_64>; defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add", - VGPR_32, i32, atomic_add_global>; + VGPR_32, i32, atomic_load_add_global_32>; defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub", - VGPR_32, i32, atomic_sub_global>; + VGPR_32, i32, atomic_load_sub_global_32>; defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin", - VGPR_32, i32, atomic_min_global>; + VGPR_32, i32, atomic_load_min_global_32>; defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin", - VGPR_32, i32, atomic_umin_global>; + VGPR_32, i32, atomic_load_umin_global_32>; defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax", - VGPR_32, i32, atomic_max_global>; + VGPR_32, i32, atomic_load_max_global_32>; defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax", - VGPR_32, i32, atomic_umax_global>; + VGPR_32, i32, atomic_load_umax_global_32>; defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and", - VGPR_32, i32, atomic_and_global>; + VGPR_32, i32, atomic_load_and_global_32>; defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or", - VGPR_32, i32, atomic_or_global>; + VGPR_32, i32, atomic_load_or_global_32>; defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor", - VGPR_32, i32, atomic_xor_global>; + VGPR_32, i32, atomic_load_xor_global_32>; defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc", - VGPR_32, i32, atomic_inc_global>; + VGPR_32, i32, atomic_inc_global_32>; defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec", - VGPR_32, i32, atomic_dec_global>; + VGPR_32, i32, atomic_dec_global_32>; defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2", - VReg_64, i64, atomic_add_global>; + VReg_64, i64, atomic_load_add_global_64>; defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2", - VReg_64, i64, atomic_sub_global>; + VReg_64, i64, atomic_load_sub_global_64>; defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2", - VReg_64, i64, atomic_min_global>; + VReg_64, i64, atomic_load_min_global_64>; defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2", - VReg_64, i64, atomic_umin_global>; + VReg_64, i64, atomic_load_umin_global_64>; defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2", - VReg_64, i64, atomic_max_global>; + VReg_64, i64, atomic_load_max_global_64>; defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2", - VReg_64, i64, atomic_umax_global>; + VReg_64, i64, atomic_load_umax_global_64>; defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", - VReg_64, i64, atomic_and_global>; + VReg_64, i64, atomic_load_and_global_64>; defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", - VReg_64, i64, atomic_or_global>; + VReg_64, i64, atomic_load_or_global_64>; defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", - VReg_64, i64, atomic_xor_global>; + VReg_64, i64, atomic_load_xor_global_64>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", - VReg_64, i64, atomic_inc_global>; + VReg_64, i64, atomic_inc_global_64>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", - VReg_64, 
i64, atomic_dec_global>; + VReg_64, i64, atomic_dec_global_64>; } // End is_flat_global = 1 } // End SubtargetPredicate = HasFlatGlobalInsts @@ -686,10 +686,10 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_add_f32", VGPR_32, f32, atomic_add_global + "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret >; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global + "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret >; } // End SubtargetPredicate = HasAtomicFaddInsts @@ -777,8 +777,6 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>; @@ -787,41 +785,50 @@ def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>; def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>; + +foreach vt = Reg32Types.types in { +def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>; +def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>; +} + +foreach vt = VReg_64.RegTypes in { +def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt, VReg_64>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>; +} + def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>; def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>; def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>; def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>; -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>; +def : 
FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>; def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>; def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>; def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; @@ -847,9 +854,6 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } // End OtherPredicates = [HasFlatAddressSpace] -def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>; -def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>; - let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>; @@ -863,8 +867,16 @@ def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>; -def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>; -def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>; +foreach vt = Reg32Types.types in { +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, vt>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, vt, VGPR_32>; +} + +foreach vt = VReg_64.RegTypes in { +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, vt>; +def : 
FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, vt, VReg_64>; +} + def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>; def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>; @@ -875,8 +887,6 @@ def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>; def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>; def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>; def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>; -def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>; -def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>; @@ -902,36 +912,36 @@ def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>; def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>; def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; -def : 
FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>; + +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; -def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>; -def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>; -def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>; +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>; +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 @@ -1174,7 +1184,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : let AssemblerPredicate = isGFX10Plus; let DecoderNamespace = "GFX10"; - let Inst{11-0} = {offset{12}, offset{10-0}}; + let Inst{11-0} = offset{11-0}; let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); let Inst{55} = 0; diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp index e1845e2e8e87..98678873e37c 100644 --- a/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -41,6 +41,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -155,8 +156,6 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, RegSubRegPair CombOldVGPR, bool CombBCZ) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); - assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == - TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp); @@ -178,7 +177,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, if (OldIdx != -1) { assert(OldIdx == NumOperands); assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); - 
DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg); + auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); + DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef, + CombOldVGPR.SubReg); ++NumOperands; } else { // TODO: this discards MAC/FMA instructions for now, let's add it later @@ -195,6 +196,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; + } else if (AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src0_modifiers) != -1) { + DPPInst.addImm(0); + ++NumOperands; } auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(Src0); @@ -214,6 +219,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; + } else if (AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src1_modifiers) != -1) { + DPPInst.addImm(0); + ++NumOperands; } if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { @@ -344,6 +353,10 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); assert(DstOpnd && DstOpnd->isReg()); auto DPPMovReg = DstOpnd->getReg(); + if (DPPMovReg.isPhysical()) { + LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n"); + return false; + } if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) { LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); @@ -362,7 +375,13 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { bool BoundCtrlZero = BCZOpnd->getImm(); auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); + auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(OldOpnd && OldOpnd->isReg()); + assert(SrcOpnd && SrcOpnd->isReg()); + if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) { + LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n"); + return false; + } auto * const OldOpndValue = getOldOpndValue(*OldOpnd); // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else @@ -408,6 +427,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { dbgs() << ", bound_ctrl=" << CombBCZ << '\n'); SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs; + DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos; auto CombOldVGPR = getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef @@ -420,13 +440,49 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { OrigMIs.push_back(&MovMI); bool Rollback = true; + SmallVector<MachineOperand*, 16> Uses; + for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) { + Uses.push_back(&Use); + } + + while (!Uses.empty()) { + MachineOperand *Use = Uses.pop_back_val(); Rollback = true; - auto &OrigMI = *Use.getParent(); + auto &OrigMI = *Use->getParent(); LLVM_DEBUG(dbgs() << " try: " << OrigMI); auto OrigOp = OrigMI.getOpcode(); + if (OrigOp == AMDGPU::REG_SEQUENCE) { + Register FwdReg = OrigMI.getOperand(0).getReg(); + unsigned FwdSubReg = 0; + + if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) { + LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" + " for all uses\n"); + break; + } + + unsigned OpNo, E = OrigMI.getNumOperands(); + for (OpNo = 1; OpNo < E; 
OpNo += 2) { + if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) { + FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm(); + break; + } + } + + if (!FwdSubReg) + break; + + for (auto &Op : MRI->use_nodbg_operands(FwdReg)) { + if (Op.getSubReg() == FwdSubReg) + Uses.push_back(&Op); + } + RegSeqWithOpNos[&OrigMI].push_back(OpNo); + continue; + } + if (TII->isVOP3(OrigOp)) { if (!TII->hasVALU32BitEncoding(OrigOp)) { LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); @@ -447,14 +503,14 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } LLVM_DEBUG(dbgs() << " combining: " << OrigMI); - if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } } else if (OrigMI.isCommutable() && - &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); @@ -475,9 +531,22 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { OrigMIs.push_back(&OrigMI); } + Rollback |= !Uses.empty(); + for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) MI->eraseFromParent(); + if (!Rollback) { + for (auto &S : RegSeqWithOpNos) { + if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) { + S.first->eraseFromParent(); + continue; + } + while (!S.second.empty()) + S.first->getOperand(S.second.pop_back_val()).setIsUndef(true); + } + } + return !Rollback; } @@ -498,6 +567,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; + } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + auto Split = TII->expandMovDPP64(MI); + for (auto M : { Split.first, Split.second }) { + if (combineDPPMov(*M)) + ++NumDPPMovsCombined; + } + Changed = true; } } } diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 885239e2faed..9528aee4c50e 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -726,7 +726,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, if (!TRI->isVGPR(MRI, Def.getReg())) return WaitStatesNeeded; - unsigned Reg = Def.getReg(); + Register Reg = Def.getReg(); auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { int DataIdx = createsVALUHazard(*MI); return DataIdx >= 0 && @@ -792,7 +792,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg())) return 0; - unsigned LaneSelectReg = LaneSelectOp->getReg(); + Register LaneSelectReg = LaneSelectOp->getReg(); auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; @@ -891,7 +891,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* // which is always a VGPR and available. 
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); - unsigned Reg = Src0->getReg(); + Register Reg = Src0->getReg(); bool IsUndef = Src0->isUndef(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32)) @@ -952,6 +952,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { unsigned SDSTName; switch (MI->getOpcode()) { case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READLANE_B32_gfx10: case AMDGPU::V_READFIRSTLANE_B32: SDSTName = AMDGPU::OpName::vdst; break; @@ -976,7 +977,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { if (!SDST) return false; - const unsigned SDSTReg = SDST->getReg(); + const Register SDSTReg = SDST->getReg(); auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) { return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI); }; @@ -1251,14 +1252,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; const int MaxWaitStates = 18; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); unsigned HazardDefLatency = 0; auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this] (MachineInstr *MI) { if (!IsMFMAFn(MI)) return false; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); if (DstReg == Reg) return false; HazardDefLatency = std::max(HazardDefLatency, @@ -1304,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) { if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) return false; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); return TRI.regsOverlap(Reg, DstReg); }; @@ -1330,14 +1331,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; const int MaxWaitStates = 13; - unsigned DstReg = MI->getOperand(0).getReg(); + Register DstReg = MI->getOperand(0).getReg(); unsigned HazardDefLatency = 0; auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this] (MachineInstr *MI) { if (!IsMFMAFn(MI)) return false; - unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); HazardDefLatency = std::max(HazardDefLatency, TSchedModel.computeInstrLatency(MI)); return TRI.regsOverlap(Reg, DstReg); @@ -1376,7 +1377,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg())) continue; - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); const int AccVgprReadLdStWaitStates = 2; const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index 1eb617640c32..39072af7d871 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3525174223bd..90ab6a14ce20 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -237,7 +237,7 @@ public: 
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) - : BaseClass(C, llvm::make_unique<SchedStrategyStub>()) + : BaseClass(C, std::make_unique<SchedStrategyStub>()) , Context(C) , Strategy(S) , UPTracker(*LIS) { diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp index 51c4c99cfb18..36a8f74150f5 100644 --- a/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -173,11 +173,11 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { bool NSA = false; for (unsigned I = 0; I < Info->VAddrDwords; ++I) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); - unsigned Reg = Op.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + Register Reg = Op.getReg(); + if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) return NSA_Status::FIXED; - unsigned PhysReg = VRM->getPhys(Reg); + Register PhysReg = VRM->getPhys(Reg); if (!Fast) { if (!PhysReg) @@ -276,7 +276,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { SlotIndex MinInd, MaxInd; for (unsigned I = 0; I < Info->VAddrDwords; ++I) { const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); - unsigned Reg = Op.getReg(); + Register Reg = Op.getReg(); LiveInterval *LI = &LIS->getInterval(Reg); if (llvm::find(Intervals, LI) != Intervals.end()) { // Same register used, unable to make sequential diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp index f0d47eaa4ed1..2927d4eb745a 100644 --- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -230,7 +230,7 @@ private: public: Printable printReg(unsigned Reg, unsigned SubReg = 0) const { return Printable([Reg, SubReg, this](raw_ostream &OS) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { OS << llvm::printReg(Reg, TRI); return; } @@ -275,7 +275,7 @@ char GCNRegBankReassign::ID = 0; char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { - assert (TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); @@ -293,7 +293,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, int Bank) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (!VRM->isAssignedReg(Reg)) return 0; @@ -364,7 +364,7 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, if (!Op.isReg() || Op.isUndef()) continue; - unsigned R = Op.getReg(); + Register R = Op.getReg(); if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) continue; @@ -420,12 +420,12 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, } bool GCNRegBankReassign::isReassignable(unsigned Reg) const { - if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) return false; const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); - unsigned PhysReg = VRM->getPhys(Reg); + Register PhysReg = VRM->getPhys(Reg); if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) return false; @@ -654,7 +654,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { } 
std::sort(BankStalls.begin(), BankStalls.end()); - unsigned OrigReg = VRM->getPhys(C.Reg); + Register OrigReg = VRM->getPhys(C.Reg); LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 39460fbd8a84..d593204cba05 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -40,7 +40,7 @@ void llvm::printLivesAt(SlotIndex SI, << *LIS.getInstructionFromIndex(SI); unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + const unsigned Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); @@ -84,7 +84,7 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, unsigned GCNRegPressure::getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(Register::isVirtualRegister(Reg)); const auto RC = MRI.getRegClass(Reg); auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? @@ -183,7 +183,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { - OS << "VGPRs: " << getVGPRNum(); + OS << "VGPRs: " << Value[VGPR32] << ' '; + OS << "AGPRs: " << Value[AGPR32]; if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; OS << ", SGPRs: " << getSGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; @@ -196,8 +197,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { static LaneBitmask getDefRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { - assert(MO.isDef() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg())); // We don't rely on read-undef flag because in case of tentative schedule // tracking it isn't set correctly yet. 
This works correctly however since @@ -210,8 +210,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO, static LaneBitmask getUsedRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI, const LiveIntervals &LIS) { - assert(MO.isUse() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg())); if (auto SubReg = MO.getSubReg()) return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); @@ -232,7 +231,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { SmallVector<RegisterMaskPair, 8> Res; for (const auto &MO : MI.operands()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg())) continue; if (!MO.isUse() || !MO.readsReg()) continue; @@ -278,7 +277,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, const MachineRegisterInfo &MRI) { GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - auto Reg = TargetRegisterInfo::index2VirtReg(I); + auto Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); @@ -329,8 +328,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(AtMIPressure, MaxPressure); for (const auto &MO : MI.defs()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) || - MO.isDead()) + if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead()) continue; auto Reg = MO.getReg(); @@ -408,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() { for (const auto &MO : LastTrackedMI->defs()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; auto &LiveMask = LiveRegs[Reg]; auto PrevMask = LiveMask; @@ -500,7 +498,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, const MachineRegisterInfo &MRI) { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + unsigned Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index e4894418b943..5862cdb04166 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -214,7 +214,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - auto Reg = TargetRegisterInfo::index2VirtReg(I); + auto Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; auto &LI = LIS.getInterval(Reg); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 4ea990ae490e..973491a70d3c 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -71,8 +71,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // the tracker, so we need to pass those function a non-const copy. 
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - std::vector<unsigned> Pressure; - std::vector<unsigned> MaxPressure; + Pressure.clear(); + MaxPressure.clear(); if (AtTop) TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); @@ -103,10 +103,10 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // the analysis to look through dependencies to find the path with the least // register pressure. - // We only need to update the RPDelata for instructions that increase - // register pressure. Instructions that decrease or keep reg pressure - // the same will be marked as RegExcess in tryCandidate() when they - // are compared with instructions that increase the register pressure. + // We only need to update the RPDelta for instructions that increase register + // pressure. Instructions that decrease or keep reg pressure the same will be + // marked as RegExcess in tryCandidate() when they are compared with + // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet()); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); @@ -160,6 +160,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); Cand.setBest(TryCand); + LLVM_DEBUG(traceCandidate(Cand)); } } } @@ -195,6 +196,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(BotCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); + assert(TCand.SU == BotCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif } // Check if the top Q has a better candidate. @@ -206,6 +216,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { LLVM_DEBUG(traceCandidate(TopCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); + assert(TCand.SU == TopCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif } // Pick best from BotCand and TopCand. 
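The initCandidate() hunk above, together with the GCNSchedStrategy.h hunk that follows, replaces two per-call std::vector temporaries with class members that are clear()ed on entry, so repeated candidate evaluations reuse previously allocated capacity instead of reallocating. A minimal sketch of that reuse pattern in isolation, assuming nothing about the real scheduler; the class and member names below are illustrative stand-ins, not the actual LLVM types:

#include <algorithm>
#include <vector>

class CandidateScorer {               // illustrative stand-in only
  std::vector<unsigned> Pressure;     // scratch buffers kept as members so
  std::vector<unsigned> MaxPressure;  // their capacity survives across calls

public:
  unsigned score(const std::vector<unsigned> &UnitIncrease) {
    Pressure.clear();                 // clear() keeps the existing allocation
    MaxPressure.clear();
    for (unsigned Inc : UnitIncrease) {
      Pressure.push_back(Inc);
      MaxPressure.push_back(std::max(Inc, 10u));
    }
    // Stand-in for the real heuristic: report the worst tracked value.
    return MaxPressure.empty()
               ? 0u
               : *std::max_element(MaxPressure.begin(), MaxPressure.end());
  }
};

The values computed per call are unchanged by this pattern; only the allocation behaviour differs, which matters when initCandidate() runs once per candidate in the queue.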
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index eaf3dee9ba5d..dd687a930c79 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -40,6 +40,9 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure); + std::vector<unsigned> Pressure; + std::vector<unsigned> MaxPressure; + unsigned SGPRExcessLimit; unsigned VGPRExcessLimit; unsigned SGPRCriticalLimit; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 57c0ba26cc3a..1f94ab799122 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -109,7 +109,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (static_cast<unsigned>(Fixup.getKind())) { + switch (Fixup.getTargetKind()) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 6549a8d7d592..d352219a7a98 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -87,7 +87,7 @@ std::unique_ptr<MCObjectTargetWriter> llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend, uint8_t ABIVersion) { - return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, + return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, HasRelocationAddend, ABIVersion); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 01b53432cbb7..a9888e6ed924 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "slc"); } +void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { +} + void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); @@ -292,35 +296,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } #endif - unsigned AltName = AMDGPU::Reg32; - - if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg64; - else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg128; - else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg96; - else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg160; - else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg256; - else if 
(MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg512; - else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) || - MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo)) - AltName = AMDGPU::Reg1024; - - O << getRegisterName(RegNo, AltName); + O << getRegisterName(RegNo); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -623,9 +599,11 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10: + case AMDGPU::V_CNDMASK_B32_dpp_gfx10: case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10: + case AMDGPU::V_CNDMASK_B32_dpp8_gfx10: case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: @@ -689,6 +667,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, switch (MI->getOpcode()) { default: break; + case AMDGPU::V_CNDMASK_B32_sdwa_gfx10: case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10: case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index b544d1ef3605..66b70831ff9e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -12,7 +12,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H -#include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -26,8 +25,7 @@ public: //Autogenerated by tblgen void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = AMDGPU::NoRegAltName); + static const char *getRegisterName(unsigned RegNo); void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; @@ -74,6 +72,8 @@ private: raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 8f11433476f4..c15da8075a34 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -250,7 +250,7 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { const uint32_t Encoded_s_code_end = 0xbf9f0000; OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; - OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n'; + OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n'; return true; } @@ -602,7 +602,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() { MCStreamer &OS = getStreamer(); OS.PushSection(); OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); - for 
(unsigned I = 0; I < 32; ++I) + for (unsigned I = 0; I < 48; ++I) OS.EmitIntValue(Encoded_s_code_end, 4); OS.PopSection(); return true; diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 4735e6cb2446..f33ad950d5d9 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -26,7 +26,7 @@ def MIMGEncoding : GenericEnum { // Represent an ISA-level opcode, independent of the encoding and the // vdata/vaddr size. -class MIMGBaseOpcode { +class MIMGBaseOpcode : PredicateControl { MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME); bit Store = 0; bit Atomic = 0; @@ -291,7 +291,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm, multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0, bit isResInfo = 0> { - def "" : MIMGBaseOpcode, PredicateControl { + def "" : MIMGBaseOpcode { let Coordinates = !if(isResInfo, 0, 1); let LodOrClampOrMip = mip; let HasD16 = has_d16; diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp index 3fb18862fca8..b29cd75f75cf 100644 --- a/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -104,7 +104,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Functions needs to be cacheline (256B) aligned. - MF.ensureAlignment(8); + MF.ensureAlignment(Align(256)); SetupMachineFunction(MF); diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 8098b81d1ea2..e4160ac11c86 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -303,7 +303,7 @@ private: if (!MO.isReg()) continue; if (MO.isDef()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (R600::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else @@ -312,7 +312,7 @@ private: &R600::R600_Reg128RegClass); } if (MO.isUse()) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (R600::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index c6e8a060d8a0..fd75c41040e1 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -135,7 +135,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; for (unsigned Chan = 0; Chan < 4; ++Chan) { @@ -155,12 +155,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { unsigned Opcode = BMI->getOpcode(); // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. 
- unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, R600::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, R600::OpName::src1)) - .getReg(); + Register Src0 = + BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src0)) + .getReg(); + Register Src1 = + BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src1)) + .getReg(); (void) Src0; (void) Src1; if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && @@ -205,10 +205,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // T0_Z = CUBE T1_X, T1_Z // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { - unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); - unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); + Register DstReg = + MI.getOperand(TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); + Register Src0 = + MI.getOperand(TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index 950e238f4979..283e4d1935ea 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -15,9 +15,9 @@ namespace llvm { class R600FrameLowering : public AMDGPUFrameLowering { public: - R600FrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) : - AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + R600FrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()) + : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~R600FrameLowering() override; void emitPrologue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index f80a53ba1dc6..659458b0b752 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/MathExtras.h" #include <cassert> #include <cstdint> #include <iterator> @@ -334,8 +335,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case R600::MASK_WRITE: { - unsigned maskedRegister = MI.getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + Register maskedRegister = MI.getOperand(0).getReg(); + assert(Register::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; @@ -782,7 +783,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); + DAG.getConstantFP(numbers::pif, DL, MVT::f32)); } SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index d9e839fe2035..04a5e93f6213 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -97,8 +97,8 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() && + RI.isPhysRegLiveAcrossClauses(I->getReg())) return false; } return true; @@ -242,8 +242,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { for (MachineInstr::const_mop_iterator I = MI.operands_begin(), E = MI.operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) + if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg())) continue; if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) @@ -294,7 +293,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -317,7 +316,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); @@ -348,7 +347,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, unsigned i = 0; for (const auto &Src : getSrcs(MI)) { ++i; - unsigned Reg = Src.first->getReg(); + Register Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); @@ -865,7 +864,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { if (idx < 0) return false; - unsigned Reg = MI.getOperand(idx).getReg(); + Register Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; case R600::PRED_SEL_ONE: @@ -1038,7 +1037,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); @@ -1052,7 +1051,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); if 
(OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); @@ -1193,8 +1192,7 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); for (std::pair<unsigned, unsigned> LI : MRI.liveins()) { unsigned Reg = LI.first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) + if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg)) continue; unsigned RegIndex; diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 34267a909b5e..7569a2629539 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -183,7 +183,7 @@ isPhysicalRegCopy(MachineInstr *MI) { if (MI->getOpcode() != R600::COPY) return false; - return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); + return !Register::isVirtualRegister(MI->getOperand(1).getReg()); } void R600SchedStrategy::releaseTopNode(SUnit *SU) { @@ -209,7 +209,7 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { bool R600SchedStrategy::regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { return RC->contains(Reg); } else { return MRI->getRegClass(Reg) == RC; @@ -270,7 +270,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); + Register DestReg = MI->getOperand(0).getReg(); if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; @@ -357,7 +357,7 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { if (DstIndex == -1) { return; } - unsigned DestReg = MI->getOperand(DstIndex).getReg(); + Register DestReg = MI->getOperand(DstIndex).getReg(); // PressureRegister crashes if an operand is def and used in the same inst // and we try to constraint its regclass for (MachineInstr::mop_iterator It = MI->operands_begin(), diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 9f1cb6582b5c..cec7f563f480 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -58,7 +58,7 @@ using namespace llvm; static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { assert(MRI.isSSA()); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); return MI && MI->isImplicitDef(); @@ -197,17 +197,17 @@ unsigned getReassignedChan( MachineInstr *R600VectorRegMerger::RebuildVector( RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); + Register Reg = RSI->Instr->getOperand(0).getReg(); MachineBasicBlock::iterator Pos = RSI->Instr; MachineBasicBlock &MBB = *Pos->getParent(); DebugLoc DL = Pos->getDebugLoc(); - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; for (DenseMap<unsigned, 
unsigned>::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); + Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned SubReg = (*It).first; unsigned Swizzle = (*It).second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); @@ -350,7 +350,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MachineInstr &MI = *MII; if (MI.getOpcode() != R600::REG_SEQUENCE) { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI.getOperand(1).getReg(); + Register Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); It != E; ++It) { @@ -363,7 +363,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { RegSeqInfo RSI(*MRI, &MI); // All uses of MI are swizzeable ? - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); if (!areAllUsesSwizzeable(Reg)) continue; diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index df200baf11c1..176269f9b68c 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -90,7 +90,7 @@ private: if (DstIdx == -1) { continue; } - unsigned Dst = BI->getOperand(DstIdx).getReg(); + Register Dst = BI->getOperand(DstIdx).getReg(); if (isTrans || TII->isTransOnly(*BI)) { Result[Dst] = R600::PS; continue; @@ -136,7 +136,7 @@ private: int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); if (OperandIdx < 0) continue; - unsigned Src = MI.getOperand(OperandIdx).getReg(); + Register Src = MI.getOperand(OperandIdx).getReg(); const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); if (It != PVs.end()) MI.getOperand(OperandIdx).setReg(It->second); diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 685df74490fe..ef12c1d24594 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -93,7 +93,7 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight( } bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + assert(!Register::isVirtualRegister(Reg)); switch (Reg) { case R600::OQAP: diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp index f8094e35816c..ee011286b8ff 100644 --- a/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -129,7 +129,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { continue; // Create a register for the intialization value. - unsigned PrevDst = + Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); unsigned NewDst = 0; // Final initialized value will be in here @@ -150,7 +150,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); // Initialize dword - unsigned SubReg = + Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) .addImm(0); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a0e1ec6ac235..23ef56afc39c 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -99,7 +99,10 @@ enum : uint64_t { FPAtomic = UINT64_C(1) << 53, // Is a MFMA instruction. 
- IsMAI = UINT64_C(1) << 54 + IsMAI = UINT64_C(1) << 54, + + // Is a DOT instruction. + IsDOT = UINT64_C(1) << 55 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -444,6 +447,7 @@ namespace DPP { enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, + QUAD_PERM_ID = 0xE4, // identity permutation QUAD_PERM_LAST = 0xFF, DPP_UNUSED1 = 0x100, ROW_SHL0 = 0x100, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 624953963cf4..65286751c12d 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass { public: static char ID; + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; + void processPHINode(MachineInstr &MI); + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -148,7 +154,7 @@ static bool hasVectorOperands(const MachineInstr &MI, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + !Register::isVirtualRegister(MI.getOperand(i).getReg())) continue; if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) @@ -161,21 +167,19 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> getCopyRegClasses(const MachineInstr &Copy, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); + Register DstReg = Copy.getOperand(0).getReg(); + Register SrcReg = Copy.getOperand(1).getReg(); - const TargetRegisterClass *SrcRC = - TargetRegisterInfo::isVirtualRegister(SrcReg) ? - MRI.getRegClass(SrcReg) : - TRI.getPhysRegClass(SrcReg); + const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg) + ? MRI.getRegClass(SrcReg) + : TRI.getPhysRegClass(SrcReg); // We don't really care about the subregister here. // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); - const TargetRegisterClass *DstRC = - TargetRegisterInfo::isVirtualRegister(DstReg) ? - MRI.getRegClass(DstReg) : - TRI.getPhysRegClass(DstReg); + const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg) + ? 
MRI.getRegClass(DstReg) + : TRI.getPhysRegClass(DstReg); return std::make_pair(SrcRC, DstRC); } @@ -199,10 +203,10 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, const SIInstrInfo *TII) { MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); auto &Src = MI.getOperand(1); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = Src.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - !TargetRegisterInfo::isVirtualRegister(DstReg)) + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = Src.getReg(); + if (!Register::isVirtualRegister(SrcReg) || + !Register::isVirtualRegister(DstReg)) return false; for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { @@ -238,7 +242,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, MachineRegisterInfo &MRI) { assert(MI.isRegSequence()); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) return false; @@ -250,7 +254,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return false; // It is illegal to have vreg inputs to a physreg defining reg_sequence. - if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg())) + if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg())) return false; const TargetRegisterClass *SrcRC, *DstRC; @@ -281,7 +285,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, bool IsAGPR = TRI->hasAGPRs(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { - unsigned SrcReg = MI.getOperand(I).getReg(); + Register SrcReg = MI.getOperand(I).getReg(); unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); @@ -291,7 +295,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); - unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpReg = MRI.createVirtualRegister(NewSrcRC); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) @@ -299,7 +303,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (IsAGPR) { const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); - unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC); + Register TmpAReg = MRI.createVirtualRegister(NewSrcRC); unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? 
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), @@ -315,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return true; } -static bool phiHasVGPROperands(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *TRI, - const SIInstrInfo *TII) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) - return true; - } - return false; -} - -static bool phiHasBreakDef(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - SmallSet<unsigned, 8> &Visited) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (Visited.count(Reg)) - continue; - - Visited.insert(Reg); - - MachineInstr *DefInstr = MRI.getVRegDef(Reg); - switch (DefInstr->getOpcode()) { - default: - break; - case AMDGPU::SI_IF_BREAK: - return true; - case AMDGPU::PHI: - if (phiHasBreakDef(*DefInstr, MRI, Visited)) - return true; - } - } - return false; -} - -static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, - const TargetRegisterInfo &TRI) { - for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), - E = MBB.end(); I != E; ++I) { - if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) - return true; - } - return false; -} - static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, const MachineInstr *MoveImm, const SIInstrInfo *TII, @@ -422,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { - return hasTerminatorThatModifiesExec(*MBB, *TRI); }); -} - // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -468,6 +420,7 @@ getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // executioon. static bool hoistAndMergeSGPRInits(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo *TRI, MachineDominatorTree &MDT, const TargetInstrInfo *TII) { // List of inits by immediate value. @@ -482,7 +435,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, for (auto &MI : MRI.def_instructions(Reg)) { MachineOperand *Imm = nullptr; - for (auto &MO: MI.operands()) { + for (auto &MO : MI.operands()) { if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { Imm = nullptr; @@ -587,8 +540,44 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } } - for (auto MI : MergedInstrs) - MI->removeFromParent(); + // Remove initializations that were merged into another. + for (auto &Init : Inits) { + auto &Defs = Init.second; + auto I = Defs.begin(); + while (I != Defs.end()) { + if (MergedInstrs.count(*I)) { + (*I)->eraseFromParent(); + I = Defs.erase(I); + } else + ++I; + } + } + + // Try to schedule SGPR initializations as early as possible in the MBB. + for (auto &Init : Inits) { + auto &Defs = Init.second; + for (auto MI : Defs) { + auto MBB = MI->getParent(); + MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); + MachineBasicBlock::reverse_iterator B(BoundaryMI); + // Check if B should actually be a boundary. If not set the previous + // instruction as the boundary instead. 
+ if (!TII->isBasicBlockPrologue(*B)) + B++; + + auto R = std::next(MI->getReverseIterator()); + const unsigned Threshold = 50; + // Search until B or Threshold for a place to insert the initialization. + for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) + if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || + TII->isSchedulingBoundary(*R, MBB, *MBB->getParent())) + break; + + // Move to directly after R. + if (&*--R != MI) + MBB->splice(*R, MBB, MI); + } + } if (Changed) MRI.clearKillFlags(Reg); @@ -598,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); SmallVector<MachineInstr *, 16> Worklist; @@ -617,22 +606,39 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) - continue; + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; - std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); + + if (!Register::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't + // really much we can do to fix this. + // Some special instructions use M0 as an input. Some even only use + // the first lane. Insert a readfirstlane and hope for the best. + if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { + Register TmpReg + = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, MI, MI.getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + .add(MI.getOperand(1)); + MI.getOperand(1).setReg(TmpReg); + } + + continue; + } + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - unsigned SrcReg = MI.getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register SrcReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(SrcReg)) { TII->moveToVALU(MI, MDT); break; } - MachineInstr *DefMI = MRI.getVRegDef(SrcReg); + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with @@ -651,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } case AMDGPU::PHI: { - unsigned Reg = MI.getOperand(0).getReg(); - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. 
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { - LLVM_DEBUG(dbgs() - << "Not fixing PHI for uniform branch: " << MI << '\n'); - break; - } - } - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. - // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - SmallSet<unsigned, 8> Visited; - if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI, MDT); - } - + processPHINode(MI); break; } case AMDGPU::REG_SEQUENCE: if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || !hasVectorOperands(MI, TRI)) { - foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); + foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI); continue; } @@ -724,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + DstRC = MRI->getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI->getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI->getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVectorRegisters(Src0RC) || TRI->hasVectorRegisters(Src1RC))) { @@ -735,12 +684,159 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } break; } + case AMDGPU::V_WRITELANE_B32: { + // Some architectures allow more than one constant bus access without + // SGPR restriction + if (ST.getConstantBusLimit(MI.getOpcode()) != 1) + break; + + // Writelane is special in that it can use SGPR and M0 (which would + // normally count as using the constant bus twice - but in this case it + // is allowed since the lane selector doesn't count as a use of the + // constant bus). However, it is still required to abide by the 1 SGPR + // rule. 
Apply a fix here as we might have multiple SGPRs after + // legalizing VGPRs to SGPRs + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); + + // Check to see if the instruction violates the 1 SGPR rule + if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) && + Src0.getReg() != AMDGPU::M0) && + (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) && + Src1.getReg() != AMDGPU::M0)) { + + // Check for trivially easy constant prop into one of the operands + // If this is the case then perform the operation now to resolve SGPR + // issue. If we don't do that here we will always insert a mov to m0 + // that can't be resolved in later operand folding pass + bool Resolved = false; + for (MachineOperand *MO : {&Src0, &Src1}) { + if (Register::isVirtualRegister(MO->getReg())) { + MachineInstr *DefMI = MRI->getVRegDef(MO->getReg()); + if (DefMI && TII->isFoldableCopy(*DefMI)) { + const MachineOperand &Def = DefMI->getOperand(0); + if (Def.isReg() && + MO->getReg() == Def.getReg() && + MO->getSubReg() == Def.getSubReg()) { + const MachineOperand &Copied = DefMI->getOperand(1); + if (Copied.isImm() && + TII->isInlineConstant(APInt(64, Copied.getImm(), true))) { + MO->ChangeToImmediate(Copied.getImm()); + Resolved = true; + break; + } + } + } + } + } + + if (!Resolved) { + // Haven't managed to resolve by replacing an SGPR with an immediate + // Move src1 to be in M0 + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::COPY), AMDGPU::M0) + .add(Src1); + Src1.ChangeToRegister(AMDGPU::M0, false); + } + } + break; + } } } } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII); + hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); return true; } + +void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { + unsigned numVGPRUses = 0; + bool AllAGPRUses = true; + SetVector<const MachineInstr *> worklist; + SmallSet<const MachineInstr *, 4> Visited; + worklist.insert(&MI); + Visited.insert(&MI); + while (!worklist.empty()) { + const MachineInstr *Instr = worklist.pop_back_val(); + unsigned Reg = Instr->getOperand(0).getReg(); + for (const auto &Use : MRI->use_operands(Reg)) { + const MachineInstr *UseMI = Use.getParent(); + AllAGPRUses &= (UseMI->isCopy() && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) || + TRI->isAGPR(*MRI, Use.getReg()); + if (UseMI->isCopy() || UseMI->isRegSequence()) { + if (UseMI->isCopy() && + UseMI->getOperand(0).getReg().isPhysical() && + !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) { + numVGPRUses++; + } + if (Visited.insert(UseMI).second) + worklist.insert(UseMI); + + continue; + } + + if (UseMI->isPHI()) { + const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg()); + if (!TRI->isSGPRReg(*MRI, Use.getReg()) && + UseRC != &AMDGPU::VReg_1RegClass) + numVGPRUses++; + continue; + } + + const TargetRegisterClass *OpRC = + TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use)); + if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass && + OpRC != &AMDGPU::VS_64RegClass) { + numVGPRUses++; + } + } + } + + Register PHIRes = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); + if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { + LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); + MRI->setRegClass(PHIRes, 
TRI->getEquivalentAGPRClass(RC0)); + } + + bool hasVGPRInput = false; + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + unsigned InputReg = MI.getOperand(i).getReg(); + MachineInstr *Def = MRI->getVRegDef(InputReg); + if (TRI->isVectorRegister(*MRI, InputReg)) { + if (Def->isCopy()) { + unsigned SrcReg = Def->getOperand(1).getReg(); + const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, SrcReg); + if (TRI->isSGPRClass(RC)) + continue; + } + hasVGPRInput = true; + break; + } + else if (Def->isCopy() && + TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) { + hasVGPRInput = true; + break; + } + } + + if ((!TRI->isVectorRegister(*MRI, PHIRes) && + RC0 != &AMDGPU::VReg_1RegClass) && + (hasVGPRInput || numVGPRUses > 1)) { + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); + TII->moveToVALU(MI); + } + else { + LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); + TII->legalizeOperands(MI, MDT); + } + +} diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp index 5b834c8de13a..a0119297b112 100644 --- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -91,8 +91,7 @@ static bool findSRegBaseAndIndex(MachineOperand *Op, Worklist.push_back(Op); while (!Worklist.empty()) { MachineOperand *WOp = Worklist.pop_back_val(); - if (!WOp->isReg() || - !TargetRegisterInfo::isVirtualRegister(WOp->getReg())) + if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg())) continue; MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg()); switch (DefInst->getOpcode()) { diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 74d77d328019..4eac03168760 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, switch (Opc) { case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_FMAC_F32_e64: { + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast<int>(OpNo) == Src2Idx) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned Opc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? 
AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } @@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold, if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); - auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); - if (Liveness != MachineBasicBlock::LQR_Dead) + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16); + if (Liveness != MachineBasicBlock::LQR_Dead) { + LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n"); return false; + } MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); int Op32 = Fold.getShrinkOpcode(); @@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold, bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); - unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); + Register NewReg0 = MRI.createVirtualRegister(Dst0RC); MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); @@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64) && + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64; unsigned NewOpc = IsFMA ? - AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) : + (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. @@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) return false; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, if (!OpToFold.isReg()) return false; - unsigned UseReg = OpToFold.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + Register UseReg = OpToFold.getReg(); + if (!Register::isVirtualRegister(UseReg)) return false; if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { @@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, return false; // Can only fold splat constants } + if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op)) + return false; + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); return true; } @@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand( // REG_SEQUENCE instructions, so we have to fold them into the // uses of REG_SEQUENCE. 
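// --- Illustrative sketch (not part of the patch): the IsFMA/IsF32 selection
// above picks the three-address form a two-address V_MAC/V_FMAC becomes when
// an immediate is folded into src2. The enums below are placeholders for the
// real AMDGPU opcode constants; only the mapping logic is shown.
enum class MacOpc { MAC_F32, MAC_F16, FMAC_F32, FMAC_F16 };
enum class MadOpc { MAD_F32, MAD_F16, FMA_F32, FMA_F16_gfx9 };

MadOpc threeAddressForm(MacOpc Opc) {
  bool IsFMA = Opc == MacOpc::FMAC_F32 || Opc == MacOpc::FMAC_F16;
  bool IsF32 = Opc == MacOpc::MAC_F32 || Opc == MacOpc::FMAC_F32;
  return IsFMA ? (IsF32 ? MadOpc::FMA_F32 : MadOpc::FMA_F16_gfx9)
               : (IsF32 ? MadOpc::MAD_F32 : MadOpc::MAD_F16);
}
// --- end sketch ---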
if (UseMI->isRegSequence()) { - unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + Register RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); MachineRegisterInfo::use_iterator Next; @@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand( OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImmLike && UseMI->isCopy()) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI->getRegClass(DestReg) : - TRI->getPhysRegClass(DestReg); - - unsigned SrcReg = UseMI->getOperand(1).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + Register DestReg = UseMI->getOperand(0).getReg(); + + // Don't fold into a copy to a physical register. Doing so would interfere + // with the register coalescer's logic which would avoid redundant + // initalizations. + if (DestReg.isPhysical()) + return; + + const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg); + + Register SrcReg = UseMI->getOperand(1).getReg(); + if (SrcReg.isVirtual()) { // XXX - This can be an assert? const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; @@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand( return; UseMI->setDesc(TII->get(MovOp)); + MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin(); + MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end(); + while (ImpOpI != ImpOpE) { + MachineInstr::mop_iterator Tmp = ImpOpI; + ImpOpI++; + UseMI->RemoveOperand(UseMI->getOperandNo(Tmp)); + } CopiesToReplace.push_back(UseMI); } else { if (UseMI->isCopy() && OpToFold.isReg() && - TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && + Register::isVirtualRegister(UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { @@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand( // => // %sgpr1 = COPY %sgpr0 UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->getOperand(1).setReg(OpToFold.getReg()); + UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); + UseMI->getOperand(1).setIsKill(false); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; } @@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand( // Split 64-bit constants into 32-bits for folding. if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { - unsigned UseReg = UseOp.getReg(); + Register UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) @@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, if (Op.isReg()) { // If this has a subregister, it obviously is a register source. 
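// --- Illustrative sketch (not part of the patch): both the V_WRITELANE_B32
// fix earlier in this patch and getImmOrMaterializedImm here look through a
// virtual register's defining instruction for a materialized immediate. A
// simplified standalone model of that lookup (Operand/DefMap are stand-ins
// for MachineOperand and MachineRegisterInfo, not real LLVM types):
#include <cstdint>
#include <optional>
#include <unordered_map>

struct SketchOperand {
  bool IsImm = false;
  int64_t Imm = 0;
  unsigned VReg = 0;      // meaningful when !IsImm
  bool HasSubReg = false; // sub-register reads are left alone
};

// Maps a virtual register to the immediate its def materializes, if any.
using ImmDefMap = std::unordered_map<unsigned, int64_t>;

std::optional<int64_t> getImmIfMaterialized(const SketchOperand &Op,
                                            const ImmDefMap &Defs) {
  if (Op.IsImm)
    return Op.Imm;
  if (Op.HasSubReg) // a sub-register read is definitely a register source
    return std::nullopt;
  auto It = Defs.find(Op.VReg);
  if (It != Defs.end())
    return It->second; // defined by a foldable move-immediate
  return std::nullopt;
}
// --- end sketch ---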
if (Op.getSubReg() != AMDGPU::NoSubRegister || - !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + !Register::isVirtualRegister(Op.getReg())) return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); @@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { + if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) { + Register Reg = Fold.OpToFold->getReg(); + MachineInstr *DefMI = Fold.OpToFold->getParent(); + if (DefMI->readsRegister(AMDGPU::EXEC, TRI) && + execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI)) + continue; + } if (updateOperand(Fold, *TII, *TRI, *ST)) { // Clear kill flags. if (Fold.isReg()) { @@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; + + MachineOperand *CurrentKnownM0Val = nullptr; for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; @@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) tryFoldClamp(MI); + + // Saw an unknown clobber of m0, so we no longer know what it is. + if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI)) + CurrentKnownM0Val = nullptr; + continue; + } + + // Specially track simple redefs of m0 to the same value in a block, so we + // can erase the later ones. + if (MI.getOperand(0).getReg() == AMDGPU::M0) { + MachineOperand &NewM0Val = MI.getOperand(1); + if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { + MI.eraseFromParent(); + continue; + } + + // We aren't tracking other physical registers + CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ? + nullptr : &NewM0Val; continue; } @@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!FoldingImm && !OpToFold.isReg()) continue; - if (OpToFold.isReg() && - !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) + if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg())) continue; // Prevent folding operands backwards in the function. For example, @@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // ... // %vgpr0 = V_MOV_B32_e32 1, implicit %exec MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg())) continue; foldInstOperand(MI, OpToFold); diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index f3c9ad63a80a..26bae5734df7 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -120,7 +120,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { return false; // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. 
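// --- Illustrative sketch (not part of the patch): the CurrentKnownM0Val
// tracking added above in SIFoldOperands erases a later "m0 = <same value>"
// inside a block when nothing clobbered m0 in between. A simplified per-block
// model of that idea (Write and the erase step are stand-ins, not MIR ops):
#include <optional>
#include <vector>

struct M0Write {
  bool TargetsM0 = false;  // writes the tracked register (m0)
  bool ClobbersM0 = false; // some other def/clobber of m0
  long Value = 0;          // the value written when TargetsM0
};

// Returns the writes that survive; a repeat of the known m0 value is dropped.
std::vector<M0Write> dropRedundantM0Writes(const std::vector<M0Write> &Block) {
  std::vector<M0Write> Out;
  std::optional<long> KnownM0;
  for (const M0Write &W : Block) {
    if (W.TargetsM0) {
      if (KnownM0 && *KnownM0 == W.Value)
        continue;          // same value already in m0: erase this write
      KnownM0 = W.Value;   // track the newly written value
    } else if (W.ClobbersM0) {
      KnownM0.reset();     // unknown clobber: stop tracking
    }
    Out.push_back(W);
  }
  return Out;
}
// --- end sketch ---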
for (const MachineOperand &ResMO : MI.defs()) { - unsigned ResReg = ResMO.getReg(); + Register ResReg = ResMO.getReg(); for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || MO.isDef()) continue; @@ -144,7 +144,7 @@ static unsigned getMopState(const MachineOperand &MO) { S |= RegState::Kill; if (MO.isEarlyClobber()) S |= RegState::EarlyClobber; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) + if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) S |= RegState::Renamable; return S; } @@ -152,7 +152,7 @@ static unsigned getMopState(const MachineOperand &MO) { template <typename Callable> void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const { - if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) || + if (LaneMask.all() || Register::isPhysicalRegister(Reg) || LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { Func(0); return; @@ -216,7 +216,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); // If it is tied we will need to write same register as we read. if (MO.isTied()) @@ -227,7 +227,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, if (Conflict == Map.end()) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); @@ -265,13 +265,13 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; - LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ? - TRI->getSubRegIndexLaneMask(MO.getSubReg()) : - LaneBitmask::getAll(); + LaneBitmask Mask = Register::isVirtualRegister(Reg) + ? TRI->getSubRegIndexLaneMask(MO.getSubReg()) + : LaneBitmask::getAll(); RegUse &Map = MO.isDef() ? 
Defs : Uses; auto Loc = Map.find(Reg); @@ -389,7 +389,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Defs) { unsigned Reg = R.first; Uses.erase(Reg); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); @@ -397,7 +397,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { for (auto &&R : Uses) { unsigned Reg = R.first; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) continue; LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index feab6bed2603..ed07ed100a19 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -202,15 +206,15 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - unsigned FlatScratchInitReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + Register FlatScratchInitReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -424,8 +428,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); // We need to insert initialization of the scratch resource descriptor. 
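// --- Illustrative sketch (not part of the patch): the memory-clause former
// above accumulates, per register, the union of lanes touched (the full mask
// for physical registers, the sub-register lane mask for virtual ones) and
// rejects an instruction whose lanes overlap a pending def. A minimal model
// of that bookkeeping with a plain 64-bit mask standing in for LaneBitmask:
#include <cstdint>
#include <unordered_map>

using SketchLaneMask = uint64_t;
using SketchRegUse = std::unordered_map<unsigned, SketchLaneMask>;

// Record that Reg's lanes in Mask are used (or defined) inside the clause.
void collectLaneUse(SketchRegUse &Map, unsigned Reg, SketchLaneMask Mask) {
  Map[Reg] |= Mask;
}

// A read of (Reg, Mask) conflicts with the clause if any of those lanes were
// already written by an instruction bundled into the clause.
bool conflictsWithClauseDefs(const SketchRegUse &Defs, unsigned Reg,
                             SketchLaneMask Mask) {
  auto It = Defs.find(Reg);
  return It != Defs.end() && (It->second & Mask) != 0;
}
// --- end sketch ---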
- unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdHsaOrMesa(F)) { @@ -539,9 +543,9 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either // the amdgpu-git-ptr-high function attribute or the top part of the PC - unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -601,14 +605,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); if (MFI->hasImplicitBufferPtr()) { - unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); @@ -640,8 +644,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR()); } } else { - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); BuildMI(MBB, I, DL, SMovB32, Rsrc0) .addExternalSymbol("SCRATCH_RSRC_DWORD0") @@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index c644f4726e2c..d9970fd6b4b8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -20,9 +20,9 @@ class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: - SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) : - AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + SIFrameLowering(StackDirection D, Align StackAl, int LAO, + Align TransAl = Align::None()) + : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; void emitEntryFunctionPrologue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index db0782e2bf3e..56ebf9c06741 100644 --- 
a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,11 +20,11 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -35,6 +35,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/DAGCombine.h" @@ -44,6 +45,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" @@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); @@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); - addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); - addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); @@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); // Unless there are also VOP3P operations, not operations are really legal. 
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } @@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v32i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); @@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); - - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMA, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); @@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + + 
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ADDCARRY); setTargetDAGCombine(ISD::SUB); @@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // TODO: Consider splitting all arguments into 32-bit pieces. - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); + + if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 32) return ScalarVT.getSimpleVT(); - if (Size == 64) + if (Size > 32) return MVT::i32; if (Size == 16 && Subtarget->has16BitInsts()) return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; - } + } else if (VT.getSizeInBits() > 32) + return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); + + if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); @@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, if (Size == 32) return NumElts; - if (Size == 64) - return 2 * NumElts; + if (Size > 32) + return NumElts * ((Size + 31) / 32); if (Size == 16 && Subtarget->has16BitInsts()) - return (VT.getVectorNumElements() + 1) / 2; - } + return (NumElts + 1) / 2; + } else if (VT.getSizeInBits() > 32) + return (VT.getSizeInBits() + 31) / 32; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size == 64) { + if (Size > 32) { RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = 2 * NumElts; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } @@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getImagePSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(RsrcIntr->RsrcArg)); - Info.align = 0; + Info.align.reset(); } else { Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), @@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); @@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(1)); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); @@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getOperand(0)->getType() ->getPointerElementType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; @@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); @@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // This is an abstract access, but we need to specify a type and size. 
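// --- Illustrative sketch (not part of the patch): for non-kernel calling
// conventions the hunks above now split any scalar wider than 32 bits into
// ceil(Size/32) i32 pieces and pack pairs of 16-bit elements when the
// subtarget has 16-bit instructions. The arithmetic, with no LLVM types:
unsigned numArgRegisters(unsigned NumElts, unsigned ScalarBits,
                         bool IsVector, bool Has16BitInsts) {
  if (IsVector) {
    if (ScalarBits == 32)
      return NumElts;                            // one register per element
    if (ScalarBits > 32)
      return NumElts * ((ScalarBits + 31) / 32); // split into i32 pieces
    if (ScalarBits == 16 && Has16BitInsts)
      return (NumElts + 1) / 2;                  // packed v2i16/v2f16
    return NumElts;                              // simplified fallback
  }
  if (ScalarBits > 32)
    return (ScalarBits + 31) / 32;               // e.g. i64 -> 2 x i32
  return 1;
}
// e.g. numArgRegisters(3, 64, true, true) == 6,
//      numArgRegisters(5, 16, true, true) == 3.
// --- end sketch ---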
Info.memVT = MVT::i32; Info.size = 4; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) @@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *IsFast) const { +bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; - } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte @@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } // Smaller than dword value must be aligned. - if (VT.bitsLT(MVT::i32)) + if (Size < 32) return false; // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the @@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32) && Align % 4 == 0; + return Size >= 32 && Align >= 4; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + // Until MVT is extended to handle this, simply check for the size and + // rely on the condition below: allow accesses if the size is a multiple of 4. + if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && + VT.getStoreSize() > 16)) { + return false; + } + + return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Align, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - + int NumElts = VT.getVectorNumElements(); + if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, // entire split argument. if (Arg->Flags.isSplit()) { while (!Arg->Flags.isSplitEnd()) { - assert(!Arg->VT.isVector() && + assert((!Arg->VT.isVector() || + Arg->VT.getScalarSizeInBits() == 16) && "unexpected vector split in ps argument type"); if (!SkipArg) Splits.push_back(*Arg); @@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, } // Allocate special inputs passed in VGPRs. 
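// --- Illustrative sketch (not part of the patch): the new
// allowsMisalignedMemoryAccessesImpl above works on a raw size in bits. A
// simplified model of its generic tail (the LDS/scratch special cases earlier
// in the hunk are deliberately omitted): sub-dword accesses must be aligned,
// and dword-or-larger accesses are allowed and fast with 4-byte alignment.
bool allowsMisalignedGlobalAccess(unsigned SizeInBits, unsigned AlignInBytes,
                                  bool *IsFast = nullptr) {
  if (IsFast)
    *IsFast = false;
  if (SizeInBits < 32)          // smaller-than-dword values must be aligned
    return false;
  bool Ok = AlignInBytes >= 4;  // the two LSBs of the address are ignored
  if (IsFast)
    *IsFast = Ok;
  return Ok;
}
// --- end sketch ---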
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (Info.hasWorkItemIDX()) { - unsigned Reg = AMDGPU::VGPR0; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = AMDGPU::VGPR1; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = AMDGPU::VGPR2; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); @@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, assert(Reg != AMDGPU::NoRegister); MachineFunction &MF = CCInfo.getMachineFunction(); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); return ArgDescriptor::createRegister(Reg, Mask); } @@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } -static void allocateSpecialInputSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); // TODO: Unify handling with private memory pointers. @@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo, } // Allocate special inputs passed in user SGPRs. 
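// --- Illustrative sketch (not part of the patch): the masks 0x3ff,
// 0x3ff << 10 and 0x3ff << 20 used above for callable functions select the
// X, Y and Z work-item ids packed 10 bits each into one 32-bit VGPR. A small
// standalone model of that unpacking (the layout is taken from those masks):
#include <cstdint>

constexpr uint32_t WorkItemIDMask = 0x3ff;

uint32_t workItemIDX(uint32_t Packed) { return Packed & WorkItemIDMask; }
uint32_t workItemIDY(uint32_t Packed) { return (Packed >> 10) & WorkItemIDMask; }
uint32_t workItemIDZ(uint32_t Packed) { return (Packed >> 20) & WorkItemIDMask; }
// --- end sketch ---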
-static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { if (Info.hasImplicitBufferPtr()) { unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); @@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); - MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); + + Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } if (Info.hasDispatchID()) { @@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } // Allocate special input registers that are initialized per-wave. -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { +void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { unsigned Reg = Info.addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { unsigned Reg = Info.addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupInfo()) { unsigned Reg = Info.addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } @@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = + Register PrivateSegmentBufferReg = Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); } else { @@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // // FIXME: Should not do this if inline asm is reading/writing these // registers. - unsigned PreloadedSP = Info.getPreloadedReg( + Register PreloadedSP = Info.getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setStackPtrOffsetReg(PreloadedSP); @@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) @@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); EVT ValVT = VA.getValVT(); @@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsThisReturn = false; MachineFunction &MF = DAG.getMachineFunction(); + if (Callee.isUndef() || isNullConstant(Callee)) { + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } + + return Chain; + } + if (IsVarArg) { return lowerUnhandledCall(CLI, InVals, "unsupported call to variadic function "); @@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, int32_t Offset = LocMemOffset; SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); - unsigned Align = 0; + MaybeAlign Alignment; if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; @@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Flags.getByValSize() : VA.getValVT().getStoreSize(); // FIXME: We can have better than the minimum byval required alignment. - Align = Flags.isByVal() ? Flags.getByValAlign() : - MinAlign(Subtarget->getStackAlignment(), Offset); + Alignment = + Flags.isByVal() + ? MaybeAlign(Flags.getByValAlign()) + : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); @@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } else { DstAddr = PtrOff; DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); - Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset); + Alignment = + commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, + Alignment ? Alignment->value() : 0); MemOpChains.push_back(Store); } } @@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } -unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<Register>(RegName) .Case("m0", AMDGPU::M0) .Case("exec", AMDGPU::EXEC) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Default(AMDGPU::NoRegister); + .Default(Register()); if (Reg == AMDGPU::NoRegister) { report_fatal_error(Twine("invalid register name \"" @@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { return std::make_pair(LoopBB, RemainderBB); } +/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *RemainderBB; const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); + // Apparently kill flags are only valid if the def is in the same block? + if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) + Src->setIsKill(false); std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); MachineBasicBlock::iterator I = LoopBB->end(); - MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); @@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, .addImm(0) .addImm(EncodedReg); - // This is a pain, but we're not allowed to have physical register live-ins - // yet. Insert a pair of copies if the VGPR0 hack is necessary. - if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { - unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) - .add(*Src); + bundleInstWithWaitcnt(MI); - BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) - .addReg(Data0); - - MRI.setSimpleHint(Data0, Src->getReg()); - } - - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); // Load and check TRAP_STS.MEM_VIOL BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) @@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); - unsigned PhiExec = MRI.createVirtualRegister(BoolRC); - unsigned NewExec = MRI.createVirtualRegister(BoolRC); - unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = MRI.createVirtualRegister(BoolRC); + Register PhiExec = MRI.createVirtualRegister(BoolRC); + Register NewExec = MRI.createVirtualRegister(BoolRC); + Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) .addReg(InitReg) @@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock::iterator I(&MI); const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC); + Register DstReg = MI.getOperand(0).getReg(); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, SetOn->getOperand(3).setIsUndef(); } else { - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) .add(*Idx) .addImm(Offset); @@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); @@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); @@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); + Register Dst = MI.getOperand(0).getReg(); const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); @@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - unsigned PhiReg = MRI.createVirtualRegister(VecRC); + Register PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, UseGPRIdxMode, false); @@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -3632,8 +3682,8 @@ MachineBasicBlock 
*SITargetLowering::EmitInstrWithCustomInserter( // S_CMOV_B64 exec, -1 MachineInstr *FirstMI = &*BB->begin(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned InputReg = MI.getOperand(0).getReg(); - unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register InputReg = MI.getOperand(0).getReg(); + Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); bool Found = false; // Move the COPY of the input reg to the beginning, so that we can use it. @@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SrcCond = MI.getOperand(3).getReg(); + Register SrcCond = MI.getOperand(3).getReg(); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); + Register SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); @@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. 
+ if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } +SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + SDValue Lo2, Hi2; + std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMA: + return splitTernaryVectorOp(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } +SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, + SelectionDAG &DAG, + ArrayRef<SDValue> Ops) const { + SDLoc DL(M); + EVT LoadVT = M->getValueType(0); + EVT EltType = LoadVT.getScalarType(); + EVT IntVT = LoadVT.changeTypeToInteger(); + + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + + unsigned Opc = + IsFormat ? 
AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD; + + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + } + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + if (isTypeLegal(LoadVT)) { + return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); + } + + EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); + SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); + SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, + M->getMemOperand(), DAG); + return DAG.getMergeValues( + {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, + DL); +} + static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_W_CHAIN: { if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { - Results.push_back(Res); - Results.push_back(Res.getValue(1)); + if (Res.getOpcode() == ISD::MERGE_VALUES) { + // FIXME: Hacky + Results.push_back(Res.getOperand(0)); + Results.push_back(Res.getOperand(1)); + } else { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } return; } @@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - unsigned LoFlags = GAFlags; - if (LoFlags == SIInstrInfo::MO_NONE) - LoFlags = SIInstrInfo::MO_REL32; SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) { PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); @@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the @@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { - Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, LoadVT, MMO)); } @@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_mov: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, 
Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p1: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p2: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = SDValue(M0.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Glue); - } case Intrinsic::amdgcn_interp_p1_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = M0.getValue(1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(5), SDValue()); if (getSubtarget()->getLDSBankCount() == 16) { // 16 bank LDS - SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Glue); + + // FIXME: This implicitly will insert a second CopyToReg to M0. + SDValue S = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, + DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Op.getOperand(5)); // m0 + SDValue Ops[] = { Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers S, // Src2 - holds two f16 values selected by high - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32) // $omod + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32) // $omod }; return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); } else { @@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32), // $omod - Glue + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32), // $omod + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); } } case Intrinsic::amdgcn_interp_p2_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); - SDValue Glue = SDValue(M0.getNode(), 1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(6), SDValue()); SDValue Ops[] = { Op.getOperand(2), // Src0 Op.getOperand(3), // Attrchan Op.getOperand(4), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(1), // Src2 - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(5), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - Glue + 
DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); } @@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } - case Intrinsic::amdgcn_wqm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), - 0); - } - case Intrinsic::amdgcn_wwm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), - 0); - } case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + SDLoc SL(Op); + unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + SDValue Aperture = getSegmentAperture(AS, SL, DAG); + SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, + Op.getOperand(1)); + + SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, + DAG.getConstant(1, SL, MVT::i32)); + return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// This function computes an appropriate offset to pass to +// MachineMemOperand::setOffset() based on the offset inputs to +// an intrinsic. If any of the offsets are non-contstant or +// if VIndex is non-zero then this function returns 0. Otherwise, +// it returns the sum of VOffset, SOffset, and Offset. +static unsigned getBufferOffsetForMMO(SDValue VOffset, + SDValue SOffset, + SDValue Offset, + SDValue VIndex = SDValue()) { + + if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || + !isa<ConstantSDNode>(Offset)) + return 0; + + if (VIndex) { + if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) + return 0; + } + + return cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue(); +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], + Ops[2])); + return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. 
+ if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = 0; switch (IntrID) { @@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: - case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); unsigned Opcode = 0; switch (IntrID) { @@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_or: - case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); unsigned Opcode = 0; switch (IntrID) { @@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + unsigned Offset = 
setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(7), // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], + Ops[4])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } - case Intrinsic::amdgcn_s_sendmsg: - case Intrinsic::amdgcn_s_sendmsghalt: { - unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? - AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case Intrinsic::amdgcn_init_exec: { - return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, - Op.getOperand(2)); - } - case Intrinsic::amdgcn_init_exec_from_input: { - return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, - Op.getOperand(2), Op.getOperand(3)); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // voffset Op.getOperand(6), // soffset Op.getOperand(7), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idexen + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idexen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Chain, @@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + unsigned Opc = + IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); @@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_buffer_store_format: { + const bool IsFormat = + IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format; + SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + EVT VDataVT = VData.getValueType(); + EVT EltType = VDataVT.getScalarType(); + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + if (IsD16) VData = handleD16VData(VData, DAG); + + if (!isTypeLegal(VDataVT)) { + VData = + DAG.getNode(ISD::BITCAST, DL, + getEquivalentMemType(*DAG.getContext(), VDataVT), VData); + } + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Chain, @@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) + if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, @@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getOperand(2).getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = VT.isVector() ? 
AMDGPUISD::BUFFER_ATOMIC_PK_FADD : AMDGPUISD::BUFFER_ATOMIC_FADD; @@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( Overflow += ImmOffset; ImmOffset = 0; } - C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32)); + C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32)); if (Overflow) { auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32); if (!N0) @@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( if (!N0) N0 = DAG.getConstant(0, DL, MVT::i32); if (!C1) - C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32)); + C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32)); return {N0, SDValue(C1, 0)}; } // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. -void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, +unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, unsigned Align) const { SDLoc DL(CombinedOffset); @@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); - Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); - return; + Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); + return SOffset + ImmOffset; } } if (DAG.isBaseWithConstantOffset(CombinedOffset)) { @@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Subtarget, Align)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); - Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); - return; + Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); + return 0; } } Offsets[0] = CombinedOffset; Offsets[1] = DAG.getConstant(0, DL, MVT::i32); - Offsets[2] = DAG.getConstant(0, DL, MVT::i32); + Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); + return 0; } // Handle 8 bit and 16 bit buffer loads @@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, M->getMemOperand()); - SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, - LoadVT.getScalarType(), BufferLoad); - return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL); + SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad); + LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal); + + return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL); } // Handle 8 bit and 16 bit buffer stores @@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, SDLoc DL, SDValue Ops[], MemSDNode *M) const { + if (VDataType == MVT::f16) + Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); Ops[1] = BufferStoreExt; unsigned Opc = (VDataType == MVT::i8) ? 
AMDGPUISD::BUFFER_STORE_BYTE : @@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); } +// Returns immediate value for setting the F32 denorm mode when using the +// S_DENORM_MODE instruction. +static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, + const SDLoc &SL, const GCNSubtarget *ST) { + assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); + int DPDenormModeDefault = ST->hasFP64Denormals() + ? FP_DENORM_FLUSH_NONE + : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + int Mode = SPDenormMode | (DPDenormModeDefault << 2); + return DAG.getTargetConstant(Mode, SL, MVT::i32); +} + SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; @@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); if (!Subtarget->hasFP32Denormals()) { SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, - SL, MVT::i32); - SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), - EnableDenormValue, BitField); + + SDValue EnableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue EnableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); + + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue); + } else { + const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, + SL, MVT::i32); + EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue, + BitField); + } + SDValue Ops[3] = { NegDivScale0, EnableDenorm.getValue(0), @@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled, Mul); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3); if (!Subtarget->hasFP32Denormals()) { - const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), - DisableDenormValue, - BitField, - Fma4.getValue(2)); + + SDValue DisableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue DisableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); + + 
DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + Fma4.getValue(2)); + } else { + const SDValue DisableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + + DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + BitField, Fma4.getValue(2)); + } SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot()); @@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { return expandUnalignedStore(Store, DAG); } @@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have // to try understanding copies to physical registers. if (SrcVal.getValueType() == MVT::i1 && - TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) { + Register::isPhysicalRegister(DestReg->getReg())) { SDLoc SL(Node); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue VReg = DAG.getRegister( @@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineOperand &Op = MI.getOperand(I); if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || - !TargetRegisterInfo::isVirtualRegister(Op.getReg()) || + !Register::isVirtualRegister(Op.getReg()) || !TRI->isAGPR(MRI, Op.getReg())) continue; auto *Src = MRI.getUniqueVRegDef(Op.getReg()); @@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, Node->use_begin()->isMachineOpcode() && Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && !Node->use_begin()->hasAnyUseOfValue(0))) { - unsigned Def = MI.getOperand(0).getReg(); + Register Def = MI.getOperand(0).getReg(); // Change this into a noret atomic. MI.setDesc(TII->get(NoRetAtomicOp)); @@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, // Combine the constants and the pointer. 
const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr, DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, @@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), PtrLo, DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), PtrHi, @@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 32: case 16: - RC = &AMDGPU::SReg_32_XM0RegClass; + RC = &AMDGPU::SReg_32RegClass; break; case 64: RC = &AMDGPU::SGPR_64RegClass; @@ -10373,7 +10568,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::SReg_96RegClass; break; case 128: - RC = &AMDGPU::SReg_128RegClass; + RC = &AMDGPU::SGPR_128RegClass; break; case 160: RC = &AMDGPU::SReg_160RegClass; @@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } break; case 'a': + if (!Subtarget->hasMAIInsts()) + break; switch (VT.getSizeInBits()) { default: return std::make_pair(0U, nullptr); @@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } -unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { - const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML); - const unsigned CacheLineAlign = 6; // log2(64) +Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); + const Align CacheLineAlign = Align(64); // Pre-GFX10 target did not benefit from loop alignment if (!ML || DisableLoopAlignment || @@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // If inner loop block is aligned assume in average half of the alignment // size to be added as nops. if (MBB != Header) - LoopSize += (1 << MBB->getAlignment()) / 2; + LoopSize += MBB->getAlignment().value() / 2; for (const MachineInstr &MI : *MBB) { LoopSize += TII->getInstSizeInBytes(MI); @@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, const MachineRegisterInfo &MRI = MF->getRegInfo(); const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return !TRI.isSGPRReg(MRI, Reg); if (MRI.isLiveIn(Reg)) { @@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, case ISD::INTRINSIC_W_CHAIN: return AMDGPU::isIntrinsicSourceOfDivergence( cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); - // In some cases intrinsics that are a source of divergence have been - // lowered to AMDGPUISD so we also need to check those too. 
- case AMDGPUISD::INTERP_MOV: - case AMDGPUISD::INTERP_P1: - case AMDGPUISD::INTERP_P2: - return true; } return false; } @@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); } + +const TargetRegisterClass * +SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) + return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass + : &AMDGPU::SReg_32RegClass; + if (!TRI->isSGPRClass(RC) && !isDivergent) + return TRI->getEquivalentSGPRClass(RC); + else if (TRI->isSGPRClass(RC) && isDivergent) + return TRI->getEquivalentVGPRClass(RC); + + return RC; +} + +static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { + if (!Visited.insert(V).second) + return false; + bool Result = false; + for (auto U : V->users()) { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) { + if (V == U->getOperand(1)) { + switch (Intrinsic->getIntrinsicID()) { + default: + Result = false; + break; + case Intrinsic::amdgcn_if_break: + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: + Result = true; + break; + } + } + if (V == U->getOperand(0)) { + switch (Intrinsic->getIntrinsicID()) { + default: + Result = false; + break; + case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_loop: + Result = true; + break; + } + } + } else { + Result = hasCFUser(U, Visited); + } + if (Result) + break; + } + return Result; +} + +bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, + const Value *V) const { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if_break: + return true; + } + } + if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) { + if (const IntrinsicInst *Intrinsic = + dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: { + ArrayRef<unsigned> Indices = ExtValue->getIndices(); + if (Indices.size() == 1 && Indices[0] == 1) { + return true; + } + } + } + } + } + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (isa<InlineAsm>(CI->getCalledValue())) { + const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); + ImmutableCallSite CS(CI); + TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( + MF.getDataLayout(), Subtarget->getRegisterInfo(), CS); + for (auto &TC : TargetConstraints) { + if (TC.Type == InlineAsm::isOutput) { + ComputeConstraintToUse(TC, SDValue()); + unsigned AssignedReg; + const TargetRegisterClass *RC; + std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint( + SIRI, TC.ConstraintCode, TC.ConstraintVT); + if (RC) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg)) + return true; + else if (SIRI->isSGPRClass(RC)) + return true; + } + } + } + } + } + SmallPtrSet<const Value *, 16> Visited; + return hasCFUser(V, Visited); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 21a215e16ce7..f0102feb65c4 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -94,6 +94,9 @@ private: SelectionDAG &DAG, ArrayRef<SDValue> 
Ops, bool IsIntrinsic = false) const; + SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG, + ArrayRef<SDValue> Ops) const; + // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to // dwordx4 if on SI. SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, @@ -183,6 +186,7 @@ private: unsigned isCFIntrinsic(const SDNode *Intr) const; +public: /// \returns True if fixup needs to be emitted for given global value \p GV, /// false otherwise. bool shouldEmitFixup(const GlobalValue *GV) const; @@ -195,11 +199,14 @@ private: /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; +private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. - void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets, unsigned Align = 4) const; + /// \returns 0 If there is a non-constant offset or if the offset is 0. + /// Otherwise returns the constant offset. + unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, + SDValue *Offsets, unsigned Align = 4) const; // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, @@ -235,6 +242,11 @@ public: bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; + bool allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AS, unsigned Align, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool *IsFast = nullptr) const; + bool allowsMisalignedMemoryAccesses( EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, @@ -309,12 +321,13 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + void bundleInstWithWaitcnt(MachineInstr &MI) const; MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const; @@ -330,6 +343,7 @@ public: bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -374,7 +388,37 @@ public: unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - unsigned getPrefLoopAlignment(MachineLoop *ML) const override; + virtual const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent) const override; + virtual bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; + Align getPrefLoopAlignment(MachineLoop *ML) const override; + + void allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const; + + void allocateSpecialEntryInputVGPRs(CCState 
&CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + void allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c89d5b71ec5c..dcb04e426584 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1483,12 +1483,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (BI.Incoming) { if (!Brackets) - Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming); + Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming); else *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = llvm::make_unique<WaitcntBrackets>(ST); + Brackets = std::make_unique<WaitcntBrackets>(ST); else Brackets->clear(); } @@ -1508,7 +1508,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!MoveBracketsToSucc) { MoveBracketsToSucc = &SuccBI; } else { - SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets); + SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets); } } else if (SuccBI.Incoming->merge(*Brackets)) { SuccBI.Dirty = true; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 561a16c3e351..4dcbe92861f2 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -124,6 +124,9 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that this is one of MFMA instructions. field bit IsMAI = 0; + // This bit indicates that this is one of DOT instructions. + field bit IsDOT = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -189,6 +192,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{54} = IsMAI; + let TSFlags{55} = IsDOT; + let SchedRW = [Write32Bit]; field bits<1> DisableSIDecoder = 0; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index ba8ed6993a56..d97e6a62971b 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -318,8 +318,25 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, if (isMUBUF(LdSt) || isMTBUF(LdSt)) { const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); - if (SOffset && SOffset->isReg()) - return false; + if (SOffset && SOffset->isReg()) { + // We can only handle this if it's a stack access, as any other resource + // would require reporting multiple base registers. 
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (AddrReg && !AddrReg->isFI()) + return false; + + const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); + const SIMachineFunctionInfo *MFI + = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); + if (RSrc->getReg() != MFI->getScratchRSrcReg()) + return false; + + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); + BaseOp = SOffset; + Offset = OffsetImm->getImm(); + return true; + } const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) @@ -458,9 +475,9 @@ bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); - const unsigned Reg = FirstDst->getReg(); + const Register Reg = FirstDst->getReg(); - const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) + const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); @@ -807,7 +824,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, "Not a VGPR32 reg"); if (Cond.size() == 1) { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(Cond[0]); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -820,7 +837,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, assert(Cond[0].isImm() && "Cond[0] is not an immediate"); switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) @@ -834,7 +851,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::SCC_FALSE: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SReg) .addImm(0) @@ -850,7 +867,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCNZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -864,7 +881,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) @@ -876,8 +893,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::EXECNZ: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); - unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -894,8 +911,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::EXECZ: { - unsigned SReg = MRI.createVirtualRegister(BoolXExecRC); - unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); + Register SReg = MRI.createVirtualRegister(BoolXExecRC); + Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -925,7 +942,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); + Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -938,7 +955,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, const DebugLoc &DL, unsigned SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC()); + Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) .addImm(Value) .addReg(SrcReg); @@ -1052,12 +1069,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { + if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) + BuildMI(MBB, MI, DL, OpDesc) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -1068,11 +1085,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // correctly handled. if (RI.spillSGPRToVGPR()) FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); - if (ST.hasScalarStores()) { - // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); - } - return; } @@ -1083,7 +1095,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, auto MIB = BuildMI(MBB, MI, DL, get(Opcode)); if (RI.hasAGPRs(RC)) { MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MIB.addReg(Tmp, RegState::Define); } MIB.addReg(SrcReg, getKillRegState(isKill)) // data @@ -1182,24 +1194,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. 
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { + if (Register::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } if (RI.spillSGPRToVGPR()) FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); - MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) + BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); - - if (ST.hasScalarStores()) { - // m0 is used for offset to scalar stores if used to spill. - Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); - } - return; } @@ -1208,7 +1214,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg); if (RI.hasAGPRs(RC)) { MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MIB.addReg(Tmp, RegState::Define); } MIB.addFrameIndex(FrameIndex) // vaddr @@ -1242,13 +1248,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); - unsigned TIDIGYReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); - unsigned TIDIGZReg - = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); - unsigned InputPtrReg = + Register TIDIGXReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); + Register TIDIGYReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); + Register TIDIGZReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + Register InputPtrReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) @@ -1410,9 +1416,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + Register Dst = MI.getOperand(0).getReg(); + Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? @@ -1437,6 +1443,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_MOV_B64_DPP_PSEUDO: { + expandMovDPP64(MI); + break; + } case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; @@ -1469,7 +1479,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_MOVRELD_B32_V8: case AMDGPU::V_MOVRELD_B32_V16: { const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); - unsigned VecReg = MI.getOperand(0).getReg(); + Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); assert(VecReg == MI.getOperand(1).getReg()); @@ -1492,9 +1502,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case AMDGPU::SI_PC_ADD_REL_OFFSET: { MachineFunction &MF = *MBB.getParent(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + Register Reg = MI.getOperand(0).getReg(); + Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. @@ -1531,7 +1541,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case TargetOpcode::BUNDLE: { - if (!MI.mayLoad()) + if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) return false; // If it is a load it must be a memory clause @@ -1550,6 +1560,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } +std::pair<MachineInstr*, MachineInstr*> +SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { + assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + Register Dst = MI.getOperand(0).getReg(); + unsigned Part = 0; + MachineInstr *Split[2]; + + + for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { + auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); + if (Dst.isPhysical()) { + MovDPP.addDef(RI.getSubReg(Dst, Sub)); + } else { + assert(MRI.isSSA()); + auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MovDPP.addDef(Tmp); + } + + for (unsigned I = 1; I <= 2; ++I) { // old and src operands. + const MachineOperand &SrcOp = MI.getOperand(I); + assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + Imm.ashrInPlace(Part * 32); + MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); + } else { + assert(SrcOp.isReg()); + Register Src = SrcOp.getReg(); + if (Src.isPhysical()) + MovDPP.addReg(RI.getSubReg(Src, Sub)); + else + MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); + } + } + + for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) + MovDPP.addImm(MI.getOperand(I).getImm()); + + Split[Part] = MovDPP; + ++Part; + } + + if (Dst.isVirtual()) + BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) + .addReg(Split[0]->getOperand(0).getReg()) + .addImm(AMDGPU::sub0) + .addReg(Split[1]->getOperand(0).getReg()) + .addImm(AMDGPU::sub1); + + MI.eraseFromParent(); + return std::make_pair(Split[0], Split[1]); +} + bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -1574,7 +1642,7 @@ bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp) { - unsigned Reg = RegOp.getReg(); + Register Reg = RegOp.getReg(); unsigned SubReg = RegOp.getSubReg(); bool IsKill = RegOp.isKill(); bool IsDead = RegOp.isDead(); @@ -1646,7 +1714,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // This needs to be implemented because the source modifiers may be inserted // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, +bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); } @@ -1710,7 +1779,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. - unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); auto I = MBB.end(); @@ -2163,7 +2232,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, SmallVector<unsigned, 8> Regs; for (int Idx = 0; Idx != NElts; ++Idx) { - unsigned DstElt = MRI.createVirtualRegister(EltRC); + Register DstElt = MRI.createVirtualRegister(EltRC); Regs.push_back(DstElt); unsigned SubIdx = SubIndices[Idx]; @@ -2327,7 +2396,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - unsigned Src1Reg = Src1->getReg(); + Register Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); Src0->setReg(Src1Reg); Src0->setSubReg(Src1SubReg); @@ -2367,12 +2436,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MRI->hasOneUse(Src0->getReg())) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; - } else if ((RI.isPhysicalRegister(Src0->getReg()) && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || - (RI.isVirtualRegister(Src0->getReg()) && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) + } else if ((Register::isPhysicalRegister(Src0->getReg()) && + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || + (Register::isVirtualRegister(Src0->getReg()) && + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) return false; // VGPR is okay as Src0 - fallthrough } @@ -2385,10 +2454,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - } else if 
((RI.isPhysicalRegister(Src1->getReg()) && - RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || - (RI.isVirtualRegister(Src1->getReg()) && - RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + } else if ((Register::isPhysicalRegister(Src1->getReg()) && + RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || + (Register::isVirtualRegister(Src1->getReg()) && + RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) return false; // VGPR is okay as Src1 - fallthrough } @@ -2472,8 +2541,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, } bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA) const { + const MachineInstr &MIb) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); assert((MIb.mayLoad() || MIb.mayStore()) && @@ -2664,6 +2732,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || + MI.getOpcode() == AMDGPU::S_DENORM_MODE || changesVGPRIndexingMode(MI); } @@ -2865,8 +2934,16 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - if (MO.isImm() && isInlineConstant(MO, OpInfo)) + const MachineFunction *MF = MI.getParent()->getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + + if (MO.isImm() && isInlineConstant(MO, OpInfo)) { + if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && + OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src2)) + return false; return RI.opCanUseInlineConstant(OpInfo.OperandType); + } if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) return false; @@ -2874,8 +2951,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) return true; - const MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); return ST.hasVOP3Literal(); } @@ -3036,7 +3111,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, if (!MO.isUse()) return false; - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (Register::isVirtualRegister(MO.getReg())) return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); // Null is free @@ -3093,7 +3168,8 @@ static bool shouldReadExec(const MachineInstr &MI) { return true; } - if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || + if (MI.isPreISelOpcode() || + SIInstrInfo::isGenericOpcode(MI.getOpcode()) || SIInstrInfo::isSALU(MI) || SIInstrInfo::isSMRD(MI)) return false; @@ -3104,7 +3180,7 @@ static bool shouldReadExec(const MachineInstr &MI) { static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg) { - if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) + if (Register::isPhysicalRegister(SubReg.getReg())) return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); return SubReg.getSubReg() != AMDGPU::NoSubRegister && @@ -3144,8 +3220,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (!Op.isReg()) continue; - unsigned Reg = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { + Register Reg = Op.getReg(); + if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) { ErrInfo = "inlineasm operand has incorrect register class."; return false; } @@ -3209,9 +3285,8 @@ bool 
SIInstrInfo::verifyInstruction(const MachineInstr &MI, continue; if (RegClass != -1) { - unsigned Reg = MI.getOperand(i).getReg(); - if (Reg == AMDGPU::NoRegister || - TargetRegisterInfo::isVirtualRegister(Reg)) + Register Reg = MI.getOperand(i).getReg(); + if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg)) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -3304,7 +3379,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Dst register should be tied to implicit use of preserved register"; return false; - } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && + } else if (Register::isPhysicalRegister(TiedMO.getReg()) && Dst.getReg() != TiedMO.getReg()) { ErrInfo = "Dst register should use same physical register as preserved"; return false; @@ -3409,6 +3484,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Special case for writelane - this can break the multiple constant bus rule, + // but still can't use more than one SGPR register + if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { + unsigned SGPRCount = 0; + Register SGPRUsed = AMDGPU::NoRegister; + + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + if (OpIdx == -1) + break; + + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { + if (MO.isReg() && MO.getReg() != AMDGPU::M0) { + if (MO.getReg() != SGPRUsed) + ++SGPRCount; + SGPRUsed = MO.getReg(); + } + } + if (SGPRCount > ST.getConstantBusLimit(Opcode)) { + ErrInfo = "WRITELANE instruction violates constant bus restriction"; + return false; + } + } + } + // Verify misc. restrictions on specific instructions. if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { @@ -3609,7 +3710,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && ST.getGeneration() >= AMDGPUSubtarget::GFX10) { ErrInfo = "Invalid dpp_ctrl value: " - "broadcats are not supported on GFX10+"; + "broadcasts are not supported on GFX10+"; return false; } if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && @@ -3631,6 +3732,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::PHI: return AMDGPU::PHI; case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; + case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; case AMDGPU::WWM: return AMDGPU::WWM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -3708,9 +3810,9 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || Desc.OpInfo[OpNo].RegClass == -1) { - unsigned Reg = MI.getOperand(OpNo).getReg(); + Register Reg = MI.getOperand(OpNo).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return RI.getPhysRegClass(Reg); } @@ -3741,7 +3843,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { else VRC = &AMDGPU::VGPR_32RegClass; - unsigned Reg = MRI.createVirtualRegister(VRC); + Register Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); MO.ChangeToRegister(Reg, false); @@ -3756,7 +3858,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 
const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - unsigned SubReg = MRI.createVirtualRegister(SubRC); + Register SubReg = MRI.createVirtualRegister(SubRC); if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) @@ -3768,7 +3870,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + Register NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -3814,11 +3916,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - RI.getPhysRegClass(Reg); + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = Register::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); @@ -3935,13 +4036,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (Opc == AMDGPU::V_WRITELANE_B32) { const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src0); Src0.ChangeToRegister(Reg, false); } if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); @@ -3967,7 +4068,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // select is uniform. 
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const DebugLoc &DL = MI.getDebugLoc(); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); @@ -4003,7 +4104,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MI.setDesc(get(CommutedOpc)); - unsigned Src0Reg = Src0.getReg(); + Register Src0Reg = Src0.getReg(); unsigned Src0SubReg = Src0.getSubReg(); bool Src0Kill = Src0.isKill(); @@ -4039,13 +4140,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); const DebugLoc &DL = MI.getDebugLoc(); if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src1); Src1.ChangeToRegister(Reg, false); } if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) .add(Src2); Src2.ChangeToRegister(Reg, false); @@ -4113,12 +4214,12 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const { const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); - unsigned DstReg = MRI.createVirtualRegister(SRC); + Register DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; if (RI.hasAGPRs(VRC)) { VRC = RI.getEquivalentVGPRClass(VRC); - unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + Register NewSrcReg = MRI.createVirtualRegister(VRC); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(TargetOpcode::COPY), NewSrcReg) .addReg(SrcReg); @@ -4134,7 +4235,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { - unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), SGPR) .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); @@ -4176,7 +4277,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const { - unsigned OpReg = Op.getReg(); + Register OpReg = Op.getReg(); unsigned OpSubReg = Op.getSubReg(); const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( @@ -4186,7 +4287,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, if (DstRC == OpRC) return; - unsigned DstReg = MRI.createVirtualRegister(DstRC); + Register DstReg = MRI.createVirtualRegister(DstRC); MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); @@ -4198,8 +4299,19 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, return; // Try to eliminate the copy if it is copying an immediate value. 
- if (Def->isMoveImmediate()) + if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) FoldImmediate(*Copy, *Def, OpReg, &MRI); + + bool ImpDef = Def->isImplicitDef(); + while (!ImpDef && Def && Def->isCopy()) { + if (Def->getOperand(1).getReg().isPhysical()) + break; + Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); + ImpDef = Def && Def->isImplicitDef(); + } + if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && + !ImpDef) + Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); } // Emit the actual waterfall loop, executing the wrapped instruction for each @@ -4223,18 +4335,18 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock::iterator I = LoopBB.begin(); - unsigned VRsrc = Rsrc.getReg(); + Register VRsrc = Rsrc.getReg(); unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); - unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC); - unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + Register AndCond = MRI.createVirtualRegister(BoolXExecRC); + Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); // Beginning of the loop, read the next Rsrc variant. BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) @@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, unsigned MovExecOpc = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); @@ -4370,10 +4482,10 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); // Zero64 = 0 @@ -4430,7 +4542,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + !Register::isVirtualRegister(MI.getOperand(i).getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); @@ -4447,8 +4559,16 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); + if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { + VRC = &AMDGPU::VReg_1RegClass; + } else + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); + } else { + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + ? RI.getEquivalentAGPRClass(VRC) + : RI.getEquivalentVGPRClass(VRC); } RC = VRC; } else { @@ -4458,7 +4578,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) continue; // MI is a PHI instruction. @@ -4483,7 +4603,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); @@ -4502,8 +4622,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize INSERT_SUBREG // src0 must have the same register class as dst if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { @@ -4577,13 +4697,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. - unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC); - unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); + Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); @@ -4623,7 +4743,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, unsigned RsrcPtr, NewSRsrc; std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); @@ -4661,6 +4781,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, MIB.addImm(TFE->getImm()); } + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { @@ -4933,8 +5055,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); unsigned NewDstReg = AMDGPU::NoRegister; if (HasDst) { - unsigned DstReg = Inst.getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = Inst.getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) continue; // Update the destination register class. 
@@ -4943,7 +5065,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; if (Inst.isCopy() && - TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + Register::isVirtualRegister(Inst.getOperand(1).getReg()) && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { // Instead of creating a copy where src and dst are the same register // class, we just replace all uses of dst with src. These kinds of @@ -4988,8 +5110,8 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned OldDstReg = Inst.getOperand(0).getReg(); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register OldDstReg = Inst.getOperand(0).getReg(); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned Opc = Inst.getOpcode(); assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); @@ -5022,8 +5144,8 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned SubOp = ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; @@ -5052,7 +5174,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, MachineOperand &Src1 = Inst.getOperand(2); if (ST.hasDLInsts()) { - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); @@ -5072,8 +5194,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, bool Src1IsSGPR = Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); MachineInstr *Xor; - unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); // Build a pair of scalar instructions and add them to the work list. 
// The next iteration over the work list will lower these to the vector @@ -5117,8 +5239,8 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) .add(Src0) @@ -5146,8 +5268,8 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) .add(Src1); @@ -5189,16 +5311,16 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5226,12 +5348,12 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned CarryReg = MRI.createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC); + Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); @@ -5327,17 +5449,17 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) .add(SrcReg0Sub0) .add(SrcReg1Sub0); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .add(SrcReg0Sub1) .add(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -5368,7 +5490,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineOperand* Op0; MachineOperand* Op1; @@ -5384,7 +5506,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) .add(*Op0); - unsigned NewDest = MRI.createVirtualRegister(DestRC); + Register NewDest = MRI.createVirtualRegister(DestRC); MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) .addReg(Interm) @@ -5411,8 +5533,8 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; - unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); @@ -5451,9 +5573,9 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, Offset == 0 && "Not implemented"); if (BitWidth < 32) { - unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) @@ -5476,8 +5598,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } MachineOperand &Src = Inst.getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) .addImm(31) @@ -5506,6 +5628,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( switch (UseMI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: @@ -5531,7 +5654,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineBasicBlock *MBB = 
Inst.getParent(); MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); @@ -5539,8 +5662,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, switch (Inst.getOpcode()) { case AMDGPU::S_PACK_LL_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // FIXME: Can do a lot better if we know the high bits of src0 or src1 are // 0. @@ -5558,7 +5681,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_LH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) .addImm(0xffff); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) @@ -5568,8 +5691,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, break; } case AMDGPU::S_PACK_HH_B32_B16: { - unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) .addImm(16) .add(Src0); @@ -5623,17 +5746,27 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: + case AMDGPU::SOFT_WQM: case AMDGPU::WWM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC)) + if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -5686,7 +5819,7 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, return MO.getReg(); // If this could be a VGPR or an SGPR, Check the dynamic register class. 
- unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); if (RI.isSGPRClass(RegRC)) UsedSGPRs[i] = Reg; @@ -5941,7 +6074,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstr *SIIF = BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) .add(Branch->getOperand(0)) @@ -5968,8 +6101,8 @@ void SIInstrInfo::convertNonUniformLoopRegion( if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { - unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC()); - unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); MachineInstrBuilder HeaderPHIBuilder = BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), @@ -5979,7 +6112,7 @@ void SIInstrInfo::convertNonUniformLoopRegion( HeaderPHIBuilder.addReg(BackEdgeReg); } else { MachineBasicBlock *PMBB = *PI; - unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); + Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), ZeroReg, 0); HeaderPHIBuilder.addReg(ZeroReg); @@ -6063,13 +6196,30 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); + Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); } +MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const { + if (ST.hasAddNoCarry()) + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); + + Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + // TODO: Users need to deal with this. + if (!UnusedCarry.isValid()) + return MachineInstrBuilder(); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} + bool SIInstrInfo::isKillTerminator(unsigned Opcode) { switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: @@ -6115,7 +6265,21 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return false; const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; - return RCID == AMDGPU::SReg_128RegClassID; + return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); +} + +unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace, + bool Signed) const { + if (!ST.hasFlatInstOffsets()) + return 0; + + if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) + return 0; + + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return Signed ? 12 : 11; + + return Signed ? 
13 : 12; } bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, @@ -6254,7 +6418,7 @@ static bool followSubRegDef(MachineInstr &MI, MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI) { assert(MRI.isSSA()); - if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) + if (!Register::isVirtualRegister(P.Reg)) return nullptr; auto RSR = P; @@ -6265,8 +6429,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, case AMDGPU::COPY: case AMDGPU::V_MOV_B32_e32: { auto &Op1 = MI->getOperand(1); - if (Op1.isReg() && - TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { + if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) { if (Op1.isUndef()) return nullptr; RSR = getRegSubRegPair(Op1); @@ -6360,3 +6523,40 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, return true; } } + +MachineInstr *SIInstrInfo::createPHIDestinationCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, + const DebugLoc &DL, Register Src, Register Dst) const { + auto Cur = MBB.begin(); + if (Cur != MBB.end()) + do { + if (!Cur->isPHI() && Cur->readsRegister(Dst)) + return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); + ++Cur; + } while (Cur != MBB.end() && Cur != LastPHIIt); + + return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, + Dst); +} + +MachineInstr *SIInstrInfo::createPHISourceCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const { + if (InsPt != MBB.end() && + (InsPt->getOpcode() == AMDGPU::SI_IF || + InsPt->getOpcode() == AMDGPU::SI_ELSE || + InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && + InsPt->definesRegister(Src)) { + InsPt++; + return BuildMI(MBB, InsPt, InsPt->getDebugLoc(), + get(ST.isWave32() ? AMDGPU::S_MOV_B32_term + : AMDGPU::S_MOV_B64_term), + Dst) + .addReg(Src, 0, SrcSubReg) + .addReg(AMDGPU::EXEC, RegState::Implicit); + } + return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, + Dst); +} + +bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 3ff35da0b963..be463442c888 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -173,7 +173,7 @@ public: } bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, @@ -229,6 +229,14 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; + // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp + // instructions. Returns a pair of generated instructions. + // Can split either post-RA with physical registers or pre-RA with + // virtual registers. In latter case IR needs to be in SSA form and + // and a REG_SEQUENCE is produced to define original register. + std::pair<MachineInstr*, MachineInstr*> + expandMovDPP64(MachineInstr &MI) const; + // Returns an opcode that can be used to move a value to a \p DstRC // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. 
@@ -242,7 +250,7 @@ public: return commuteOpcode(MI.getOpcode()); } - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0, @@ -303,8 +311,7 @@ public: bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; bool isFoldableCopy(const MachineInstr &MI) const; @@ -578,6 +585,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsMAI; } + static bool isDOT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; + } + + bool isDOT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsDOT; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -954,6 +969,19 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI) const override; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register Dst) const override; + + MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const override; + + bool isWave32() const; + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. @@ -963,6 +991,12 @@ public: const DebugLoc &DL, unsigned DestReg) const; + MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DestReg, + RegScavenger &RS) const; + static bool isKillTerminator(unsigned Opcode); const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const; @@ -970,6 +1004,8 @@ public: return isUInt<12>(Imm); } + unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const; + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT /// encoded instruction. If \p Signed, this is for an instruction that /// interprets the offset as signed. 
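The expandMovDPP64 helper declared above splits a V_MOV_B64_DPP_PSEUDO into a pair of v_mov_b32_dpp instructions; when a source operand is an immediate, the implementation shifts the 64-bit value right by Part * 32 and keeps the low 32 bits for each half. Below is a minimal standalone sketch of just that arithmetic, in plain C++ outside the LLVM MachineInstr machinery (splitImm64 is an illustrative name, not part of this patch):

#include <cstdint>
#include <cstdio>
#include <utility>

// Mirrors the APInt ashrInPlace(Part * 32) + getLoBits(32) sequence in
// expandMovDPP64: part 0 receives bits [31:0], part 1 receives bits [63:32].
static std::pair<std::uint32_t, std::uint32_t> splitImm64(std::int64_t Imm) {
  std::uint32_t Lo = static_cast<std::uint32_t>(Imm);        // operand for the sub0 v_mov_b32_dpp
  std::uint32_t Hi = static_cast<std::uint32_t>(Imm >> 32);  // operand for the sub1 v_mov_b32_dpp
  return {Lo, Hi};
}

int main() {
  std::pair<std::uint32_t, std::uint32_t> Halves = splitImm64(0x123456789ABCDEF0LL);
  std::printf("lo=0x%08x hi=0x%08x\n", (unsigned)Halves.first, (unsigned)Halves.second);
  // Prints: lo=0x9abcdef0 hi=0x12345678
  return 0;
}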
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c382c816e0b4..1eecbf555613 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; +def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; @@ -264,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", + SDTypeProfile<0 ,1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue] +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -277,7 +284,9 @@ class isFloatType<ValueType SrcVT> { !if(!eq(SrcVT.Value, f64.Value), 1, !if(!eq(SrcVT.Value, v2f16.Value), 1, !if(!eq(SrcVT.Value, v4f16.Value), 1, - 0))))); + !if(!eq(SrcVT.Value, v2f32.Value), 1, + !if(!eq(SrcVT.Value, v2f64.Value), 1, + 0))))))); } class isIntType<ValueType SrcVT> { @@ -300,14 +309,36 @@ class isPackedType<ValueType SrcVT> { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; -defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { + -def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; -def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; -def 
atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>; -def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>; -def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>; +defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>; +defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>; +defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>; +defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>; + + +} // End let AddressSpaces = ... +} // End foreach AddrSpace + +def atomic_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = f32; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} + +def atomic_pk_fadd_global_noret : PatFrag< + (ops node:$ptr, node:$value), + (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> { + // FIXME: Move this + let MemoryVT = v2f16; + let IsAtomic = 1; + let AddressSpaces = StoreAddress_global.AddrSpaces; +} //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. @@ -328,10 +359,12 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, >; def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> { + let IsLoad = 1; let IsUnindexed = 1; } def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; let IsNonExtLoad = 1; } @@ -347,14 +380,15 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } -def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { +def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; let IsAnyExtLoad = 1; } -def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ - return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; +def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { + let IsLoad = 1; + let IsSignExtLoad = 1; +} def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; @@ -391,25 +425,50 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> { let MemoryVT = i16; } -def load_glue_align8 : Aligned8Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; -def load_glue_align16 : Aligned16Bytes < - (ops node:$ptr), (load_glue node:$ptr) ->; +let IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { + let IsNonExtLoad = 1; +} -def load_local_m0 : LoadFrag<load_glue>, LocalAddress; -def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress; -def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; -def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress; -def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress; -def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress; -def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress; -def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; -def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; -def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; -def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress; +let MemoryVT = i8 in { +def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>; +def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>; +def 
zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>; +} + +let MemoryVT = i16 in { +def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>; +def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>; +def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; +} + +def load_align8_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 8; +} +def load_align16_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; + let MinAlignment = 16; +} + +} // End IsLoad = 1 + +let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { +def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_32_glue node:$ptr)> { + let MemoryVT = i32; +} +def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_64_glue node:$ptr)> { + let MemoryVT = i64; +} + +} // End let AddressSpaces = LoadAddress_local.AddrSpaces def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, @@ -420,50 +479,88 @@ def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; -def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), - (AMDGPUatomic_st_glue node:$ptr, node:$val)> { -} - def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUst_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; -}]>; + (AMDGPUst_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsUnindexed = 1; +} def store_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return !cast<StoreSDNode>(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} def truncstore_glue : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->isTruncatingStore(); -}]>; + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 1; +} def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), - (truncstore_glue node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; + (truncstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} -def store_glue_align8 : Aligned8Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { +def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (store_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} -def store_glue_align16 : Aligned16Bytes < - (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) ->; +def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + +def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} +} + +def store_align16_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { 
+ let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 16; +} -def store_local_m0 : StoreFrag<store_glue>, LocalAddress; -def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress; -def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress; -def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress; +def store_align8_local_m0 : PatFrag < + (ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; + let MinAlignment = 8; +} + +let AddressSpaces = StoreAddress_local.AddrSpaces in { + +def atomic_store_local_32_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} +def atomic_store_local_64_m0 : PatFrag < + (ops node:$value, node:$ptr), + (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} +} // End let AddressSpaces = StoreAddress_local.AddrSpaces -def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress; -def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress; def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), @@ -539,16 +636,27 @@ def lshl_rev : PatFrag < (shl $src0, $src1) >; +def add_ctpop : PatFrag < + (ops node:$src0, node:$src1), + (add (ctpop $src0), $src1) +>; + multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, - SDTypeProfile tc = SDTAtomic2> { + SDTypeProfile tc = SDTAtomic2, + bit IsInt = 1> { def _glue : SDNode < !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; - def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; - def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + let AddressSpaces = StoreAddress_local.AddrSpaces in { + defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + } + + let AddressSpaces = StoreAddress_region.AddrSpaces in { + defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + } } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -563,17 +671,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>; -defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; -defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; - -def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>; -def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>; - +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; def as_i1imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -591,6 +691,10 @@ def as_i32imm: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); }]>; +def as_i32timm: SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + def as_i64imm: SDNodeXForm<imm, [{ 
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; @@ -627,9 +731,13 @@ def SIMM16bit : ImmLeaf <i32, >; def UIMM16bit : ImmLeaf <i32, - [{return isUInt<16>(Imm); }] + [{return isUInt<16>(Imm);}] >; +def i64imm_32bit : ImmLeaf<i64, [{ + return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); +}]>; + class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -763,6 +871,18 @@ def ExpTgtMatchClass : AsmOperandClass { let RenderMethod = "printExpTgt"; } +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def VReg32OrOffClass : AsmOperandClass { + let Name = "VReg32OrOff"; + let ParserMethod = "parseVReg32OrOff"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; let ParserMatchClass = SendMsgMatchClass; @@ -778,22 +898,11 @@ def EndpgmImm : Operand<i16> { let ParserMatchClass = EndpgmMatchClass; } -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def VReg32OrOffClass : AsmOperandClass { - let Name = "VReg32OrOff"; - let ParserMethod = "parseVReg32OrOff"; -} - def WAIT_FLAG : Operand <i32> { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; - let OperandType = "OPERAND_IMMEDIATE"; } +} // End OperandType = "OPERAND_IMMEDIATE" include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -929,6 +1038,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; @@ -1317,18 +1427,6 @@ class getVALUDstForVT<ValueType VT> { VOPDstS64orS32)))); // else VT == i1 } -// Returns true if VT is floating point. -class getIsFP<ValueType VT> { - bit ret = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, v2f16.Value), 1, - !if(!eq(VT.Value, v4f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, v2f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - !if(!eq(VT.Value, v2f64.Value), 1, - 0))))))); -} - // Returns the register class to use for the destination of VOP[12C] // instructions with SDWA extension class getSDWADstForVT<ValueType VT> { @@ -1340,7 +1438,7 @@ class getSDWADstForVT<ValueType VT> { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
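The new i64imm_32bit leaf earlier in this hunk accepts a 64-bit immediate only when its upper 32 bits are zero; negative values fail because they sign-extend into the upper half. A standalone restatement of the predicate:

    #include <cassert>
    #include <cstdint>

    bool isI64Imm32Bit(int64_t Imm) {
      return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
    }

    int main() {
      assert(isI64Imm32Bit(0));
      assert(isI64Imm32Bit(0xffffffff));   // max unsigned 32-bit value
      assert(!isI64Imm32Bit(0x100000000)); // needs a 33rd bit
      assert(!isI64Imm32Bit(-1));          // sign-extends into the upper half
      return 0;
    }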
class getVOPSrc0ForVT<ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(isFP, @@ -1373,11 +1471,14 @@ class getVOPSrc0ForVT<ValueType VT> { // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT> { RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); + !if(!eq(VT.Size, 96), VReg_96, + !if(!eq(VT.Size, 64), VReg_64, + !if(!eq(VT.Size, 48), VReg_64, + VGPR_32)))); } class getSDWASrcForVT <ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); RegisterOperand ret = !if(isFP, retFlt, retInt); @@ -1386,7 +1487,7 @@ class getSDWASrcForVT <ValueType VT> { // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VSrc_128, @@ -1433,7 +1534,7 @@ class isModifierType<ValueType SrcVT> { // Return type of input modifiers operand for specified input operand class getSrcMod <ValueType VT, bit EnableF32SrcMods> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), @@ -1452,7 +1553,7 @@ class getOpSelMod <ValueType VT> { // Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { - bit isFP = getIsFP<VT>.ret; + bit isFP = isFloatType<VT>.ret; Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } @@ -2038,6 +2139,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; + field bit IsDOT = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 70f20bb69370..21984c6ad910 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -43,8 +43,8 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))] + [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))] >; let OtherPredicates = [has32BankLDS] in { @@ -66,8 +66,8 @@ defm V_INTERP_P2_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" @@ -76,8 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), - (i32 imm:$attr)))]>; + [(set f32:$vdst, 
(int_amdgcn_interp_mov (i32 imm:$vsrc), + (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; } // End Uses = [M0, EXEC] @@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI< let maybeAtomic = 1; } +def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { + let HasExt = 1; + let HasExtDPP = 1; +} + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns @@ -107,10 +112,19 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; +// 64-bit vector move with dpp. Expanded post-RA. +def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { + let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. +} + // Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the // WQM pass processes it. def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is +// turned into a copy by WQM pass, but does not seed WQM requirements. +def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; + // Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that // the instruction that defines $src0 (which is run in WWM) doesn't @@ -345,13 +359,15 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { } def SI_INIT_EXEC : SPseudoInstSI < - (outs), (ins i64imm:$src), []> { + (outs), (ins i64imm:$src), + [(int_amdgcn_init_exec (i64 timm:$src))]> { let Defs = [EXEC]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; let WaveSizePredicate = isWave64; } +// FIXME: Intrinsic should be mangled for wave size. def SI_INIT_EXEC_LO : SPseudoInstSI < (outs), (ins i32imm:$src), []> { let Defs = [EXEC_LO]; @@ -360,12 +376,20 @@ def SI_INIT_EXEC_LO : SPseudoInstSI < let WaveSizePredicate = isWave32; } +// FIXME: Wave32 version def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < - (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + (outs), (ins SSrc_b32:$input, i32imm:$shift), + [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> { let Defs = [EXEC]; let usesCustomInserter = 1; } +def : GCNPat < + (int_amdgcn_init_exec timm:$src), + (SI_INIT_EXEC_LO (as_i32imm imm:$src))> { + let WaveSizePredicate = isWave32; +} + // Return for returning shaders to a shader variant epilog. 
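The SIInstructions.td hunks a little further down rewrite the fneg/fabs patterns as explicit sign-bit operations, choosing SALU or VALU forms by register bank (S_XOR_B32/S_AND_B32/S_OR_B32 versus their V_* counterparts) and using packed masks for v2f16. A quick standalone check of the masks those patterns use (0x80000000 and 0x7fffffff for f32, 0x80008000 for packed f16):

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      // f32: flip, set, or clear bit 31.
      float x = 2.5f;
      uint32_t b = std::bit_cast<uint32_t>(x);
      assert(std::bit_cast<float>(b ^ 0x80000000u) == -2.5f);                // fneg
      assert(std::bit_cast<float>(b | 0x80000000u) == -2.5f);                // fneg(fabs)
      assert(std::bit_cast<float>((b | 0x80000000u) & 0x7fffffffu) == 2.5f); // fabs

      // v2f16: one 32-bit op handles both packed halves; 0x80008000 flips
      // bit 15 of each half, 0x7fff7fff clears it.
      uint32_t pk = 0x3c003c00u;                 // two f16 values of 1.0
      assert((pk ^ 0x80008000u) == 0xbc00bc00u); // two f16 values of -1.0
      return 0;
    }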
def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -604,25 +628,6 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC (as_i64imm $src)) -> { - let WaveSizePredicate = isWave64; -} - -def : GCNPat < - (AMDGPUinit_exec i64:$src), - (SI_INIT_EXEC_LO (as_i32imm $src)) -> { - let WaveSizePredicate = isWave32; -} - -def : GCNPat < - (AMDGPUinit_exec_from_input i32:$input, i32:$shift), - (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) ->; - def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -740,22 +745,22 @@ def : GCNPat < def : GCNPat < (i32 (fp_to_sint f16:$src)), - (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (i32 (fp_to_uint f16:$src)), - (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) + (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (sint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src)) >; def : GCNPat < (f16 (uint_to_fp i32:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) + (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) >; //===----------------------------------------------------------------------===// @@ -808,8 +813,14 @@ def : GCNPat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; } + def : GCNPat < - (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)), + (i32 (ctpop i32:$popcnt)), + (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) +>; + +def : GCNPat < + (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; @@ -1076,53 +1087,158 @@ def : GCNPat < /********** ================================ **********/ // Prevent expanding both fneg and fabs. +// TODO: Add IgnoredBySelectionDAG bit? +let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG def : GCNPat < - (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit + (fneg (fabs (f32 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit >; -// FIXME: Should use S_OR_B32 def : GCNPat < - (fneg (fabs f64:$src)), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), - (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. 
- sub1) + (fabs (f32 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) +>; + +def : GCNPat < + (fneg (f32 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) +>; + +def : GCNPat < + (fneg (f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) +>; + +def : GCNPat < + (fneg (f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) +>; + +def : GCNPat < + (fneg (fabs (f16 SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit +>; + +def : GCNPat < + (fneg (fabs (f16 VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fneg (v2f16 SReg_32:$src)), + (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) +>; + +def : GCNPat < + (fabs (v2f16 SReg_32:$src)), + (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) +>; + +// This is really (fneg (fabs v2f16:$src)) +// +// fabs is not reported as free because there is modifier for it in +// VOP3P instructions, so it is turned into the bit op. +def : GCNPat < + (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; def : GCNPat < - (fabs f32:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff))) + (fneg (v2f16 (fabs SReg_32:$src))), + (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit +>; + +// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled + // def : GCNPat < +// (fneg (f64 SReg_64:$src)), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (i32 (S_MOV_B32 (i32 0x80000000)))), +// sub1) +// >; + +// def : GCNPat < +// (fneg (fabs (f64 SReg_64:$src))), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. +// sub1) +// >; + +} // End let AddedComplexity = 1 + +def : GCNPat < + (fabs (f32 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (f32 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) +>; + +def : GCNPat < + (fabs (f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) >; def : GCNPat < - (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) + (fneg (v2f16 VGPR_32:$src)), + (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) >; def : GCNPat < - (fabs f64:$src), + (fabs (v2f16 VGPR_32:$src)), + (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) +>; + +def : GCNPat < + (fneg (v2f16 (fabs VGPR_32:$src))), + (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit +>; + +def : GCNPat < + (fabs (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. 
sub1) >; +// TODO: Use SGPR for constant def : GCNPat < - (fneg f64:$src), + (fneg (f64 VReg_64:$src)), (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), sub0, - (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; +// TODO: Use SGPR for constant +def : GCNPat < + (fneg (fabs (f64 VReg_64:$src))), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), + sub0, + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. + sub1) +>; + def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -1154,45 +1270,6 @@ def : GCNPat < (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; -def : GCNPat < - (fneg f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000))) ->; - -def : GCNPat < - (fabs f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff))) ->; - -def : GCNPat < - (fneg (fabs f16:$src)), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit ->; - -def : GCNPat < - (fneg v2f16:$src), - (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000))) ->; - -def : GCNPat < - (fabs v2f16:$src), - (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff))) ->; - -// This is really (fneg (fabs v2f16:$src)) -// -// fabs is not reported as free because there is modifier for it in -// VOP3P instructions, so it is turned into the bit op. -def : GCNPat < - (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - -def : GCNPat < - (fneg (v2f16 (fabs v2f16:$src))), - (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit ->; - /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ @@ -1544,7 +1621,7 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < @@ -1552,35 +1629,35 @@ def : GCNPat < (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f32 (uint_to_fp i1:$src)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - $src) + SSrc_i1:$src) >; def : GCNPat < (f64 (sint_to_fp i1:$src)), (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 -1), - $src)) + SSrc_i1:$src)) >; def : GCNPat < (f64 (uint_to_fp i1:$src)), (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 1), - $src)) + SSrc_i1:$src)) >; //===----------------------------------------------------------------------===// @@ -1788,6 +1865,22 @@ def : GCNPat < (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) >; +def : GCNPat < + (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + +def : GCNPat < + (i64 (int_amdgcn_update_dpp i64:$old, 
i64:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1915,3 +2008,13 @@ def : FP16Med3Pat<f16, V_MED3_F16>; defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>; defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>; } // End Predicates = [isGFX9Plus] + +class AMDGPUGenericInstruction : GenericInstruction { + let Namespace = "AMDGPU"; +} + +def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae8b967893a2..20db1c37f354 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -42,10 +42,7 @@ // // Future improvements: // -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. It currently missing stores of constants because loading +// - This is currently missing stores of constants because loading // the constant into the data register is placed between the stores, although // this is arguably a scheduling problem. // @@ -98,14 +95,9 @@ enum InstClassEnum { DS_READ, DS_WRITE, S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, - BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, - BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, - BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, - BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, - BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, + BUFFER_LOAD, + BUFFER_STORE, + MIMG, }; enum RegisterEnum { @@ -114,6 +106,7 @@ enum RegisterEnum { SOFFSET = 0x4, VADDR = 0x8, ADDR = 0x10, + SSAMP = 0x20, }; class SILoadStoreOptimizer : public MachineFunctionPass { @@ -126,6 +119,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned Width0; unsigned Width1; unsigned BaseOff; + unsigned DMask0; + unsigned DMask1; InstClassEnum InstClass; bool GLC0; bool GLC1; @@ -135,6 +130,60 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool DLC1; bool UseST64; SmallVector<MachineInstr *, 8> InstsToMove; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; + unsigned NumAddresses; + + bool hasSameBaseAddress(const MachineInstr &MI) { + for (unsigned i = 0; i < NumAddresses; i++) { + const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); + + if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { + if (AddrReg[i]->isImm() != AddrRegNext.isImm() || + AddrReg[i]->getImm() != AddrRegNext.getImm()) { + return false; + } + continue; + } + + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. 
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() || + AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { + return false; + } + } + return true; + } + + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. + // TODO: Should be possible to merge FrameIndexes and maybe some other + // non-register + if (!AddrOp->isReg()) + return false; + + // TODO: We should be able to merge physical reg addreses. + if (Register::isPhysicalRegister(AddrOp->getReg())) + return false; + + // If an address has only one use then there will be on other + // instructions with the same address, so we can't merge this one. + if (MRI.hasOneNonDBGUse(AddrOp->getReg())) + return false; + } + return true; + } + + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, + const GCNSubtarget &STM); + void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); }; struct BaseRegisters { @@ -160,14 +209,12 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII); static bool offsetsCanBeCombined(CombineInfo &CI); static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); static unsigned getNewOpcode(const CombineInfo &CI); static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); - unsigned getOpcodeWidth(const MachineInstr &MI); - InstClassEnum getInstClass(unsigned Opc); - unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -178,22 +225,27 @@ private: unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); + MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, - int32_t NewOffset); - unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); - MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); - Optional<int32_t> extractConstOffset(const MachineOperand &Op); - void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); + int32_t NewOffset) const; + unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; + MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; + Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; + void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; /// Promotes constant offset to the immediate by adjusting the base. It /// tries to use a base from the nearby instructions that allows it to have /// a 13bit constant offset which gets promoted to the immediate. 
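promoteConstantOffsetToImm, whose declaration follows, rewrites a global access to reuse a nearby instruction's base register when the remaining delta fits the immediate field; the 13-bit signed width used below is taken from the doc comment above, and the real pass also checks address-space and instruction constraints. A toy sketch of the fit test:

    #include <cassert>
    #include <cstdint>

    // Signed 13-bit range: [-4096, 4095].
    bool fitsInSigned13(int64_t V) { return V >= -(1 << 12) && V < (1 << 12); }

    int main() {
      // Accesses at base+0x1000 and base+0x1008: once the first instruction
      // materializes base+0x1000, the second can reuse that register and fold
      // the remaining 0x8 into its immediate offset.
      int64_t Off0 = 0x1000, Off1 = 0x1008;
      assert(fitsInSigned13(Off1 - Off0));
      // A 64 KiB gap cannot be folded and needs its own base register.
      assert(!fitsInSigned13(0x10000));
      return 0;
    }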
bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, - SmallPtrSet<MachineInstr *, 4> &Promoted); + SmallPtrSet<MachineInstr *, 4> &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list<std::list<CombineInfo> > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list<std::list<CombineInfo> > &MergeableInsts) const; public: static char ID; @@ -202,7 +254,11 @@ public: initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list<CombineInfo> &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -216,6 +272,264 @@ public: } }; +static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { + const unsigned Opc = MI.getOpcode(); + + if (TII.isMUBUF(Opc)) { + // FIXME: Handle d16 correctly + return AMDGPU::getMUBUFElements(Opc); + } + if (TII.isMIMG(MI)) { + uint64_t DMaskImm = + TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); + return countPopulation(DMaskImm); + } + + switch (Opc) { + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + default: + return 0; + } +} + +/// Maps instruction opcode to enum InstClassEnum. +static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) { + switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE; + } + } + if (TII.isMIMG(Opc)) { + // Ignore instructions encoded without vaddr. + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) + return UNKNOWN; + // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. + if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) + return UNKNOWN; + return MIMG; + } + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +/// Determines instruction subclass from opcode. Only instructions +/// of the same subclass can be merged together. 
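For the newly handled MIMG class, getOpcodeWidth earlier in this hunk derives the data width from dmask rather than from the opcode: each set bit enables one dword channel, so the width is a population count (the FIXME notes d16 is not handled yet). A standalone illustration:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    unsigned mimgWidth(uint32_t DMask) { return std::popcount(DMask); }

    int main() {
      assert(mimgWidth(0b0001) == 1); // one channel  -> 1 dword
      assert(mimgWidth(0b0101) == 2); // channels 0,2 -> 2 dwords
      assert(mimgWidth(0b1111) == 4); // full RGBA    -> 4 dwords
      return 0;
    }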
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { + switch (Opc) { + default: + if (TII.isMUBUF(Opc)) + return AMDGPU::getMUBUFBaseOpcode(Opc); + if (TII.isMIMG(Opc)) { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + assert(Info); + return Info->BaseOpcode; + } + return -1; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B64_gfx9: + return Opc; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + } +} + +static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { + if (TII.isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + if (TII.isMIMG(Opc)) { + unsigned result = VADDR | SRSRC; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); + if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) + result |= SSAMP; + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + + +void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, + const SIInstrInfo &TII, + const GCNSubtarget &STM) { + I = MI; + unsigned Opc = MI->getOpcode(); + InstClass = getInstClass(Opc, TII); + + if (InstClass == UNKNOWN) + return; + + switch (InstClass) { + case DS_READ: + EltSize = + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; + break; + case DS_WRITE: + EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 + : 4; + break; + case S_BUFFER_LOAD_IMM: + EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); + break; + default: + EltSize = 4; + break; + } + + if (InstClass == MIMG) { + DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); + } else { + int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); + Offset0 = I->getOperand(OffsetIdx).getImm(); + } + + Width0 = getOpcodeWidth(*I, TII); + + if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { + Offset0 &= 0xffff; + } else if (InstClass != MIMG) { + GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); + if (InstClass != S_BUFFER_LOAD_IMM) { + SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); + } + DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); + } + + unsigned AddrOpName[5] = {0}; + NumAddresses = 0; + const unsigned Regs = getRegs(I->getOpcode(), TII); + + if (Regs & ADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; + } + + if (Regs & SBASE) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; + } + + if (Regs & SRSRC) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; + } + + if (Regs & SSAMP) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; + } + + for (unsigned i = 0; i < NumAddresses; i++) { + AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); + AddrReg[i] = &I->getOperand(AddrIdx[i]); + } + + InstsToMove.clear(); +} + +void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, + const SIInstrInfo &TII) { + Paired = MI; + assert(InstClass == getInstClass(Paired->getOpcode(), TII)); + + if (InstClass == MIMG) { + DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm(); + } else { + int OffsetIdx = + AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset); + Offset1 = Paired->getOperand(OffsetIdx).getImm(); + } + + Width1 = getOpcodeWidth(*Paired, TII); + if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { + Offset1 &= 0xffff; + } else if (InstClass != MIMG) { + GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm(); + if (InstClass != S_BUFFER_LOAD_IMM) { + SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm(); + } + DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm(); + } +} + + } // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, @@ -249,8 +563,7 @@ static void addDefsUsesToList(const MachineInstr &MI, if (Op.isReg()) { if (Op.isDef()) RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && - TargetRegisterInfo::isPhysicalRegister(Op.getReg())) + else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg())) PhysRegUses.insert(Op.getReg()); } } @@ -282,7 +595,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && + (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) && PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); addDefsUsesToList(MI, RegDefs, PhysRegUses); @@ -307,7 +620,59 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, return true; } +// This function assumes that \p A and \p B have are identical except for +// size and offset, and they referecne adjacent memory. 
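combineKnownAdjacentMMOs, defined next, builds the memoperand for a merged access out of two operands already known to be adjacent: the result starts at the smaller offset and covers the summed size (the getMachineMemOperand overload used there adds to the existing offset, hence the pass-0-then-setOffset step). A toy restatement of that bookkeeping:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    struct MemRange { uint64_t Offset; uint64_t Size; };

    // Merge two adjacent ranges: smaller offset, summed size.
    MemRange combineAdjacent(MemRange A, MemRange B) {
      return {std::min(A.Offset, B.Offset), A.Size + B.Size};
    }

    int main() {
      MemRange Merged = combineAdjacent({16, 4}, {20, 4}); // two dword accesses
      assert(Merged.Offset == 16 && Merged.Size == 8);
      return 0;
    }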
+static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, + const MachineMemOperand *A, + const MachineMemOperand *B) { + unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); + unsigned Size = A->getSize() + B->getSize(); + // This function adds the offset parameter to the existing offset for A, + // so we pass 0 here as the offset and then manually set it to the correct + // value after the call. + MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); + MMO->setOffset(MinOffset); + return MMO; +} + +bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { + assert(CI.InstClass == MIMG); + + // Ignore instructions with tfe/lwe set. + const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); + const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); + + if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) + return false; + + // Check other optional immediate operands for equality. + unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, + AMDGPU::OpName::d16, AMDGPU::OpName::unorm, + AMDGPU::OpName::da, AMDGPU::OpName::r128}; + + for (auto op : OperandsToMatch) { + int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); + if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx) + return false; + if (Idx != -1 && + CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm()) + return false; + } + + // Check DMask for overlaps. + unsigned MaxMask = std::max(CI.DMask0, CI.DMask1); + unsigned MinMask = std::min(CI.DMask0, CI.DMask1); + + unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); + if ((1u << AllowedBitsForMin) <= MinMask) + return false; + + return true; +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { + assert(CI.InstClass != MIMG); + // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? if (CI.Offset0 == CI.Offset1) @@ -384,164 +749,24 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, } } -unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { - const unsigned Opc = MI.getOpcode(); - - if (TII->isMUBUF(MI)) { - return AMDGPU::getMUBUFDwords(Opc); - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - return 1; - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - return 2; - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return 4; - } -} - -InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); - - // If we couldn't identify the opcode, bail out. 
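dmasksCanBeCombined in the hunk above pairs two image loads only when every bit of the smaller dmask sits below the lowest set bit of the larger one, presumably so the merged dmask (DMask0 | DMask1, used later by mergeImagePair) keeps the two results in subregister order. A standalone restatement of that check:

    #include <algorithm>
    #include <bit>
    #include <cassert>
    #include <cstdint>

    bool dmasksCombinable(uint32_t DMask0, uint32_t DMask1) {
      uint32_t MaxMask = std::max(DMask0, DMask1);
      uint32_t MinMask = std::min(DMask0, DMask1);
      unsigned AllowedBitsForMin = std::countr_zero(MaxMask);
      return (1u << AllowedBitsForMin) > MinMask;
    }

    int main() {
      assert(dmasksCombinable(0b0011, 0b1100));  // disjoint and ordered: merges to 0b1111
      assert(!dmasksCombinable(0b0101, 0b0011)); // bit 0 is shared, cannot combine
      return 0;
    }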
- if (baseOpcode == -1) { - return UNKNOWN; - } - - switch (baseOpcode) { - default: - return UNKNOWN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: - return BUFFER_LOAD_OFFEN; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: - return BUFFER_LOAD_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - return BUFFER_STORE_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return BUFFER_STORE_OFFSET; - case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: - return BUFFER_LOAD_OFFEN_exact; - case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: - return BUFFER_LOAD_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - return BUFFER_STORE_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return BUFFER_STORE_OFFSET_exact; - } - } - - switch (Opc) { - default: - return UNKNOWN; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return S_BUFFER_LOAD_IMM; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - return DS_READ; - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return DS_WRITE; - } -} - -unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { - if (TII->isMUBUF(Opc)) { - unsigned result = 0; - - if (AMDGPU::getMUBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMUBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMUBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; - } - - switch (Opc) { - default: - return 0; - case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return SBASE; - case AMDGPU::DS_READ_B32: - case AMDGPU::DS_READ_B64: - case AMDGPU::DS_READ_B32_gfx9: - case AMDGPU::DS_READ_B64_gfx9: - case AMDGPU::DS_WRITE_B32: - case AMDGPU::DS_WRITE_B64: - case AMDGPU::DS_WRITE_B32_gfx9: - case AMDGPU::DS_WRITE_B64_gfx9: - return ADDR; - } -} - bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc); + const InstClassEnum InstClass = getInstClass(Opc, *TII); if (InstClass == UNKNOWN) { return false; } + const unsigned InstSubclass = getInstSubclass(Opc, *TII); - const unsigned Regs = getRegs(Opc); - - unsigned AddrOpName[5] = {0}; - int AddrIdx[5]; - const MachineOperand *AddrReg[5]; - unsigned NumAddresses = 0; - - if (Regs & ADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - } - - if (Regs & SBASE) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - } - - if (Regs & SRSRC) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - } - - if (Regs & SOFFSET) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - } - - if (Regs & VADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - } - - for (unsigned i = 0; i < NumAddresses; i++) { - AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); - AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (AddrReg[i]->isReg() && - (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) - return false; - } + // Do not merge VMEM buffer instructions with "swizzled" bit set. 
+ int Swizzled = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) + return false; ++MBBI; @@ -550,11 +775,10 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); for (; MBBI != E; ++MBBI) { - const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); - if ((getInstClass(MBBI->getOpcode()) != InstClass) || - (IsDS && (MBBI->getOpcode() != Opc))) { - // This is not a matching DS instruction, but we can keep looking as + if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || + (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { + // This is not a matching instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. // 2. It is safe to move MBBI down past the instruction that I will @@ -599,58 +823,23 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { CI.InstsToMove)) continue; - bool Match = true; - for (unsigned i = 0; i < NumAddresses; i++) { - const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]); - - if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { - if (AddrReg[i]->isImm() != AddrRegNext.isImm() || - AddrReg[i]->getImm() != AddrRegNext.getImm()) { - Match = false; - break; - } - continue; - } - - // Check same base pointer. Be careful of subregisters, which can occur - // with vectors of pointers. - if (AddrReg[i]->getReg() != AddrRegNext.getReg() || - AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { - Match = false; - break; - } - } + bool Match = CI.hasSameBaseAddress(*MBBI); if (Match) { - int OffsetIdx = - AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); - CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); - CI.Width0 = getOpcodeWidth(*CI.I); - CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); - CI.Width1 = getOpcodeWidth(*MBBI); - CI.Paired = MBBI; - - if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { - CI.Offset0 &= 0xffff; - CI.Offset1 &= 0xffff; - } else { - CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm(); - CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm(); - if (CI.InstClass != S_BUFFER_LOAD_IMM) { - CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); - CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); - } - CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm(); - CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm(); - } + CI.setPaired(MBBI, *TII); + + // Check both offsets (or masks for MIMG) can be combined and fit in the + // reduced range. + bool canBeCombined = + CI.InstClass == MIMG + ? dmasksCanBeCombined(CI, *TII) + : widthsFit(*STM, CI) && offsetsCanBeCombined(CI); - // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) - if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) - return true; + if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) + return true; } // We've found a load/store that we couldn't merge for some reason. @@ -711,15 +900,15 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -755,12 +944,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -809,11 +997,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = AddrReg->getReg(); + Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { - unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); @@ -839,12 +1027,65 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + Register DestReg = MRI->createVirtualRegister(SuperRC); + unsigned MergedDMask = CI.DMask0 | CI.DMask1; + unsigned DMaskIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { + if (I == DMaskIdx) + MIB.addImm(MergedDMask); + else + MIB.add((*CI.I).getOperand(I)); + } + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + moveInstsAfter(Copy1, CI.InstsToMove); + + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return New; } MachineBasicBlock::iterator @@ -855,15 +1096,24 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -883,10 +1133,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -899,24 +1148,34 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); // Copy to the new source register. - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -936,10 +1195,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -947,7 +1205,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { switch (CI.InstClass) { default: - return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); + // FIXME: Handle d16 correctly + return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), + Width); case UNKNOWN: llvm_unreachable("Unknown instruction class"); case S_BUFFER_LOAD_IMM: @@ -959,76 +1220,47 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; } + case MIMG: + assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width)); + return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); } } std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { - if (CI.Offset0 > CI.Offset1) { - switch (CI.Width0) { - default: - return std::make_pair(0, 0); - case 1: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); - case 2: - return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); - case 3: - return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); - } - case 2: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); - case 2: - return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); - } - case 3: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); - } - } + + if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4) + return std::make_pair(0, 0); + + bool ReverseOrder; + if (CI.InstClass == MIMG) { + assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) && + "No overlaps"); + ReverseOrder = CI.DMask0 > CI.DMask1; + } else + ReverseOrder = CI.Offset0 > CI.Offset1; + + static const unsigned Idxs[4][4] = { + {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, + {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, + {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, + {AMDGPU::sub3, 0, 0, 0}, + }; + unsigned Idx0; + unsigned Idx1; + + assert(CI.Width0 >= 1 && CI.Width0 <= 3); + assert(CI.Width1 >= 1 && CI.Width1 <= 3); + + if (ReverseOrder) { + Idx1 = Idxs[0][CI.Width1 - 1]; + Idx0 = Idxs[CI.Width1][CI.Width0 - 1]; } else { - switch (CI.Width0) { - default: - return std::make_pair(0, 0); - case 1: - switch 
(CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); - case 2: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); - case 3: - return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); - } - case 2: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); - case 2: - return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); - } - case 3: - switch (CI.Width1) { - default: - return std::make_pair(0, 0); - case 1: - return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); - } - } + Idx0 = Idxs[0][CI.Width0 - 1]; + Idx1 = Idxs[CI.Width0][CI.Width1 - 1]; } + + return std::make_pair(Idx0, Idx1); } const TargetRegisterClass * @@ -1040,7 +1272,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { case 2: return &AMDGPU::SReg_64_XEXECRegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 8: return &AMDGPU::SReg_256RegClass; case 16: @@ -1073,7 +1305,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { // Copy to the new source register. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + Register SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); @@ -1087,35 +1319,45 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode); + const unsigned Regs = getRegs(Opcode, *TII); if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. 
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand -SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { APInt V(32, Val, true); if (TII->isInlineConstant(V)) return MachineOperand::CreateImm(Val); - unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineInstr *Mov = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) @@ -1127,7 +1369,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { // Compute base address using Addr and return the final register. unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, - const MemAddress &Addr) { + const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); DebugLoc DL = MI.getDebugLoc(); @@ -1146,11 +1388,11 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned CarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC); + Register CarryReg = MRI->createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); - unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *LoHalf = BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) .addReg(CarryReg, RegState::Define) @@ -1170,7 +1412,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, (void)HiHalf; LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); - unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineInstr *FullBase = BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) @@ -1186,13 +1428,13 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, // Update base and offset with the NewBase and NewOffset in MI. 
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, unsigned NewBase, - int32_t NewOffset) { + int32_t NewOffset) const { TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); } Optional<int32_t> -SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { if (Op.isImm()) return Op.getImm(); @@ -1218,7 +1460,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { // %Base:vreg_64 = // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, - MemAddress &Addr) { + MemAddress &Addr) const { if (!Base.isReg()) return; @@ -1273,15 +1515,16 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineInstr &MI, MemInfoMap &Visited, - SmallPtrSet<MachineInstr *, 4> &AnchorList) { + SmallPtrSet<MachineInstr *, 4> &AnchorList) const { + + if (!(MI.mayLoad() ^ MI.mayStore())) + return false; // TODO: Support flat and scratch. - if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || - TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0) return false; - // TODO: Support Store. - if (!MI.mayLoad()) + if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) return false; if (AnchorList.count(&MI)) @@ -1418,100 +1661,166 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( return false; } -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; +void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, + std::list<std::list<CombineInfo> > &MergeableInsts) const { + for (std::list<CombineInfo> &AddrList : MergeableInsts) { + if (AddrList.front().hasSameBaseAddress(*CI.I) && + AddrList.front().InstClass == CI.InstClass) { + AddrList.emplace_back(CI); + return; + } + } + + // Base address not found, so add a new list. + MergeableInsts.emplace_back(1, CI); +} +bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, + std::list<std::list<CombineInfo> > &MergeableInsts) const { + bool Modified = false; // Contain the list MemInfoMap Visited; // Contains the list of instructions for which constant offsets are being // promoted to the IMM. SmallPtrSet<MachineInstr *, 4> AnchorList; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - + // Sort potential mergeable instructions into lists. One list per base address. + for (MachineInstr &MI : MBB.instrs()) { + // We run this before checking if an address is mergeable, because it can produce + // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); + if (InstClass == UNKNOWN) + continue; + // Don't combine if volatile. 
- if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) + continue; + + CombineInfo CI; + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock( + std::list<std::list<CombineInfo> > &MergeableInsts) { + bool Modified = false; + + for (std::list<CombineInfo> &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. + MergeList.clear(); continue; } - const unsigned Opc = MI.getOpcode(); + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); - CombineInfo CI; - CI.I = I; - CI.InstClass = getInstClass(Opc); + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list<CombineInfo> &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: break; case DS_READ: - CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: - CI.EltSize = - (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 
8 - : 4; if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: - CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; - case BUFFER_LOAD_OFFEN: - case BUFFER_LOAD_OFFSET: - case BUFFER_LOAD_OFFEN_exact: - case BUFFER_LOAD_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_LOAD: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; - case BUFFER_STORE_OFFEN: - case BUFFER_STORE_OFFSET: - case BUFFER_STORE_OFFEN_exact: - case BUFFER_STORE_OFFSET_exact: - CI.EltSize = 4; + break; + case BUFFER_STORE: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; + case MIMG: + if (findMatchingInst(CI)) { + Modified = true; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeImagePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; + } + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. + CI.InstsToMove.clear(); } return Modified; @@ -1537,10 +1846,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list<std::list<CombineInfo> > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. 
+ Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 78f409cd9555..6f9abd3a8d9b 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -98,6 +98,8 @@ private: void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); + Register getSaveExec(MachineInstr* MI); + void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const; @@ -144,7 +146,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, const SIInstrInfo *TII) { - unsigned SaveExecReg = MI.getOperand(0).getReg(); + Register SaveExecReg = MI.getOperand(0).getReg(); auto U = MRI->use_instr_nodbg_begin(SaveExecReg); if (U == MRI->use_instr_nodbg_end() || @@ -175,17 +177,31 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, return true; } +Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &SaveExec = MI->getOperand(0); + assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); + + Register SaveExecReg = SaveExec.getReg(); + unsigned FalseTermOpc = + TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + MachineBasicBlock::iterator I = (MI); + MachineBasicBlock::iterator J = std::next(I); + if (J != MBB->end() && J->getOpcode() == FalseTermOpc && + J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { + SaveExecReg = J->getOperand(0).getReg(); + J->eraseFromParent(); + } + return SaveExecReg; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - - MachineOperand &SaveExec = MI.getOperand(0); - MachineOperand &Cond = MI.getOperand(1); - assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && - Cond.getSubReg() == AMDGPU::NoSubRegister); - - Register SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = getSaveExec(&MI); + MachineOperand& Cond = MI.getOperand(1); + assert(Cond.getSubReg() == AMDGPU::NoSubRegister); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -204,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { .addReg(Exec) .addReg(Exec, RegState::ImplicitDefine); - unsigned Tmp = MRI->createVirtualRegister(BoolRC); + Register Tmp = MRI->createVirtualRegister(BoolRC); MachineInstr *And = BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) @@ -266,8 +282,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); + Register DstReg = getSaveExec(&MI); bool ExecModified = MI.getOperand(3).getImm() != 0; MachineBasicBlock::iterator Start = MBB.begin(); @@ -339,7 +354,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - auto Dst = MI.getOperand(0).getReg(); + auto Dst = getSaveExec(&MI); // Skip ANDing with exec if the break condition is already masked by exec // because 
it is a V_CMP in the same basic block. (We know the break @@ -400,13 +415,17 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + unsigned CFMask = MI.getOperand(0).getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = MBB.begin(); - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineBasicBlock::iterator InsPt = + Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) + : MBB.begin(); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); @@ -422,7 +441,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const { MachineOperand &Op = MI.getOperand(OpNo); - if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) { Src.push_back(Op); return; } @@ -442,8 +461,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, for (const auto &SrcOp : Def->explicit_operands()) if (SrcOp.isReg() && SrcOp.isUse() && - (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || - SrcOp.getReg() == Exec)) + (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec)) Src.push_back(SrcOp); } @@ -466,7 +484,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; else return; - unsigned Reg = MI.getOperand(OpToReplace).getReg(); + Register Reg = MI.getOperand(OpToReplace).getReg(); MI.RemoveOperand(OpToReplace); MI.addOperand(Ops[UniqueOpndIdx]); if (MRI->use_empty(Reg)) diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index 1c0f836f07e6..b45412536356 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -96,7 +96,7 @@ private: getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; bool isVreg1(unsigned Reg) const { - return TargetRegisterInfo::isVirtualRegister(Reg) && + return Register::isVirtualRegister(Reg) && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; } @@ -489,6 +489,15 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { return true; } +#ifndef NDEBUG +static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + Register Reg) { + unsigned Size = TRI.getRegSizeInBits(Reg, MRI); + return Size == 1 || Size == 32; +} +#endif + void SILowerI1Copies::lowerCopiesFromI1() { SmallVector<MachineInstr *, 4> DeadCopies; @@ -497,8 +506,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); if (!isVreg1(SrcReg)) continue; @@ -509,7 +518,7 @@ void SILowerI1Copies::lowerCopiesFromI1() { LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); - assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32); + assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg)); 
assert(!MI.getOperand(0).getSubReg()); ConstrainRegs.insert(SrcReg); @@ -544,7 +553,7 @@ void SILowerI1Copies::lowerPhis() { LF.initialize(MBB); for (MachineInstr &MI : MBB.phis()) { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -556,7 +565,7 @@ void SILowerI1Copies::lowerPhis() { // Collect incoming values. for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { assert(i + 1 < MI.getNumOperands()); - unsigned IncomingReg = MI.getOperand(i).getReg(); + Register IncomingReg = MI.getOperand(i).getReg(); MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB(); MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); @@ -580,12 +589,12 @@ void SILowerI1Copies::lowerPhis() { // Phis in a loop that are observed outside the loop receive a simple but // conservatively correct treatment. - MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); SSAUpdater.Initialize(DstReg); @@ -669,7 +678,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI.getOpcode() != AMDGPU::COPY) continue; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); if (!isVreg1(DstReg)) continue; @@ -686,10 +695,10 @@ void SILowerI1Copies::lowerCopiesToI1() { continue; DebugLoc DL = MI.getDebugLoc(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + if (!Register::isVirtualRegister(SrcReg) || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); @@ -702,12 +711,12 @@ void SILowerI1Copies::lowerCopiesToI1() { // Defs in a loop that are observed outside the loop must be transformed // into appropriate bit manipulation. 
- MachineBasicBlock *PostDomBound = &MBB; - for (MachineInstr &Use : MRI->use_instructions(DstReg)) { - PostDomBound = - PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); - } + std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) + DomBlocks.push_back(Use.getParent()); + MachineBasicBlock *PostDomBound = + PDT->findNearestCommonDominator(DomBlocks); unsigned FoundLoopLevel = LF.findLoop(PostDomBound); if (FoundLoopLevel) { SSAUpdater.Initialize(DstReg); @@ -734,7 +743,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { break; Reg = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return false; if (!isLaneMaskReg(Reg)) return false; diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index a82047473370..714d403a3e8f 100644 --- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -278,8 +278,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr); int FI = MI.getOperand(FIOp).getIndex(); - unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata) - ->getReg(); + Register VReg = + TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 46da974a2f45..7dd0f11c95de 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = getMaxWavesPerEU(); - limitOccupancy(MF); + Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { @@ -190,7 +189,7 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass)); NumUserSGPRs += 4; return ArgInfo.PrivateSegmentBuffer.getRegister(); } @@ -487,6 +486,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), @@ -501,8 +501,9 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { bool SIMachineFunctionInfo::initializeBaseYamlFields( const yaml::SIMachineFunctionInfo &YamlMFI) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; - MaxKernArgAlign = YamlMFI.MaxKernArgAlign; + MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); LDSSize = YamlMFI.LDSSize; + HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; IsEntryFunction = YamlMFI.IsEntryFunction; NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = 
YamlMFI.MemoryBound; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index f19b20ceb5da..7d70c786b594 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -265,6 +265,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; + uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; @@ -302,6 +303,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { StringValue("$sp_reg")); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); + YamlIO.mapOptional("highBitsOf32BitAddress", + MFI.HighBitsOf32BitAddress, 0u); } }; @@ -670,7 +673,7 @@ public: return GITPtrHigh; } - unsigned get32BitAddressHighBits() const { + uint32_t get32BitAddressHighBits() const { return HighBitsOf32BitAddress; } @@ -873,7 +876,7 @@ public: assert(BufferRsrc); auto PSV = BufferPSVs.try_emplace( BufferRsrc, - llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); + std::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); return PSV.first->second.get(); } @@ -882,14 +885,14 @@ public: assert(ImgRsrc); auto PSV = ImagePSVs.try_emplace( ImgRsrc, - llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII)); + std::make_unique<AMDGPUImagePseudoSourceValue>(TII)); return PSV.first->second.get(); } const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { if (!GWSResourcePSV) { GWSResourcePSV = - llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); + std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); } return GWSResourcePSV.get(); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index ebbdf80f9567..c072ba6b2d1c 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -348,7 +348,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // Do not Track Physical Registers, because it messes up. for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) + if (Register::isVirtualRegister(RegMaskPair.RegUnit)) LiveInRegs.insert(RegMaskPair.RegUnit); } LiveOutRegs.clear(); @@ -376,7 +376,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // The use of findDefBetween removes the case 4. 
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { unsigned Reg = RegMaskPair.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(Reg) && + if (Register::isVirtualRegister(Reg) && isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, LIS)) { @@ -1228,7 +1228,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria unsigned Color = CurrentColoring[SU->NodeNum]; if (RealID.find(Color) == RealID.end()) { int ID = CurrentBlocks.size(); - BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID)); + BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID)); CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); RealID[Color] = ID; } @@ -1690,7 +1690,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { for (unsigned Reg : Regs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; // If not already in the live set, then add it. (void) LiveRegs.insert(Reg); @@ -1750,7 +1750,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, for (unsigned Reg : InRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; if (LiveRegsConsumers[Reg] > 1) continue; @@ -1762,7 +1762,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, for (unsigned Reg : OutRegs) { // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { @@ -1801,7 +1801,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, // SIScheduleDAGMI // SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : - ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) { + ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) { SITII = static_cast<const SIInstrInfo*>(TII); SITRI = static_cast<const SIRegisterInfo*>(TRI); @@ -1913,7 +1913,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, for (_Iterator RegI = First; RegI != End; ++RegI) { unsigned Reg = *RegI; // For now only track virtual registers - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) continue; PSetIterator PSetI = MRI.getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4320e6c957a0..e914573306ae 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -656,10 +656,10 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) { std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return make_unique<SIGfx6CacheControl>(ST); + return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) - return make_unique<SIGfx7CacheControl>(ST); - return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); + return std::make_unique<SIGfx7CacheControl>(ST); + return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); } bool 
SIGfx6CacheControl::enableLoadCacheBypass( diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp index a5edd7b3554a..52989a280e80 100644 --- a/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -226,7 +226,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, // - on exit we have set the Require, Change, and initial Exit modes. void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII) { - auto NewInfo = llvm::make_unique<BlockData>(); + auto NewInfo = std::make_unique<BlockData>(); MachineInstr *InsertionPoint = nullptr; // RequirePending is used to indicate whether we are collecting the initial // requirements for the block, and need to defer the first InsertionPoint to diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3227bff20513..cc9b46a75582 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -322,7 +322,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; } - unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); + Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg(); MachineInstr *SaveExecInst = nullptr; SmallVector<MachineInstr *, 4> OtherUseInsts; diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 7e10316eab92..fdd30db6a7cb 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -211,7 +211,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, return AMDGPU::NoRegister; MachineOperand *AndCC = &And->getOperand(1); - unsigned CmpReg = AndCC->getReg(); + Register CmpReg = AndCC->getReg(); unsigned CmpSubReg = AndCC->getSubReg(); if (CmpReg == ExecReg) { AndCC = &And->getOperand(2); @@ -234,7 +234,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) return AMDGPU::NoRegister; - unsigned SelReg = Op1->getReg(); + Register SelReg = Op1->getReg(); auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return AMDGPU::NoRegister; @@ -250,15 +250,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, Op1->getImm() != 0 || Op2->getImm() != 1) return AMDGPU::NoRegister; - LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' - << *Cmp << '\t' << *And); + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' + << *And); - unsigned CCReg = CC->getReg(); + Register CCReg = CC->getReg(); LIS->RemoveMachineInstrFromMaps(*And); - MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), - TII->get(Andn2Opc), And->getOperand(0).getReg()) - .addReg(ExecReg) - .addReg(CCReg, 0, CC->getSubReg()); + MachineInstr *Andn2 = + BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), + And->getOperand(0).getReg()) + .addReg(ExecReg) + .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -266,20 +267,19 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, // Try to remove compare. Cmp value should not used in between of cmp // and s_and_b64 if VCC or just unused if any other register. 
- if ((TargetRegisterInfo::isVirtualRegister(CmpReg) && - MRI.use_nodbg_empty(CmpReg)) || + if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) || (CmpReg == CondReg && std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), [&](const MachineInstr &MI) { - return MI.readsRegister(CondReg, TRI); }))) { + return MI.readsRegister(CondReg, TRI); + }))) { LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n'); LIS->RemoveMachineInstrFromMaps(*Cmp); Cmp->eraseFromParent(); // Try to remove v_cndmask_b32. - if (TargetRegisterInfo::isVirtualRegister(SelReg) && - MRI.use_nodbg_empty(SelReg)) { + if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->RemoveMachineInstrFromMaps(*Sel); @@ -413,7 +413,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (!SaveExec || !SaveExec->isFullCopy()) continue; - unsigned SavedExec = SaveExec->getOperand(0).getReg(); + Register SavedExec = SaveExec->getOperand(0).getReg(); bool SafeToReplace = true; for (auto& U : MRI.use_nodbg_instructions(SavedExec)) { if (U.getParent() != SaveExec->getParent()) { @@ -434,7 +434,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (Changed) { for (auto Reg : RecalcRegs) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { LIS->removeInterval(Reg); if (!MRI.reg_empty(Reg)) LIS->createAndComputeVirtRegInterval(Reg); diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 2d71abc0612a..9b3b2436475c 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -574,16 +574,16 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || Opcode == AMDGPU::V_LSHLREV_B32_e64) { - return make_unique<SDWADstOperand>( + return std::make_unique<SDWADstOperand>( Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); } else { - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src1, Dst, *Imm == 16 ? 
WORD_1 : BYTE_3, false, false, Opcode != AMDGPU::V_LSHRREV_B32_e32 && Opcode != AMDGPU::V_LSHRREV_B32_e64); @@ -613,15 +613,15 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src1->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || Opcode == AMDGPU::V_LSHLREV_B16_e64) { - return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); } else { - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src1, Dst, BYTE_1, false, false, Opcode != AMDGPU::V_LSHRREV_B16_e32 && Opcode != AMDGPU::V_LSHRREV_B16_e64); @@ -677,11 +677,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src0->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(Src0->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); } @@ -706,11 +706,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(ValSrc->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) + if (Register::isPhysicalRegister(ValSrc->getReg()) || + Register::isPhysicalRegister(Dst->getReg())) break; - return make_unique<SDWASrcOperand>( + return std::make_unique<SDWASrcOperand>( ValSrc, Dst, *Imm == 0x0000ffff ? 
WORD_0 : BYTE_0); } @@ -840,7 +840,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); assert(OrDst && OrDst->isReg()); - return make_unique<SDWADstPreserveOperand>( + return std::make_unique<SDWADstPreserveOperand>( OrDst, OrSDWADef, OrOtherDef, DstSel); } @@ -1189,7 +1189,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; } - unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); if (Op.isImm()) diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index f9bfe96f65cb..6cdd12d0e7bd 100644 --- a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -90,12 +90,12 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { if (!MO.isReg()) return false; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!TRI->isVGPR(*MRI, Reg)) return false; - if (TRI->isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return false; if (VRM->hasPhys(Reg)) @@ -124,14 +124,14 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (!MO.isReg()) continue; - const unsigned VirtReg = MO.getReg(); - if (TRI->isPhysicalRegister(VirtReg)) + const Register VirtReg = MO.getReg(); + if (Register::isPhysicalRegister(VirtReg)) continue; if (!VRM->hasPhys(VirtReg)) continue; - unsigned PhysReg = VRM->getPhys(VirtReg); + Register PhysReg = VRM->getPhys(VirtReg); const unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { PhysReg = TRI->getSubReg(PhysReg, SubReg); @@ -149,7 +149,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { for (unsigned Reg : RegsToRewrite) { LIS->removeInterval(Reg); - const unsigned PhysReg = VRM->getPhys(Reg); + const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); MFI->ReserveWWMRegister(PhysReg); } diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h index 168f05f8fdd6..7c039a54b57f 100644 --- a/lib/Target/AMDGPU/SIProgramInfo.h +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,6 +41,8 @@ struct SIProgramInfo { uint64_t ComputePGMRSrc2 = 0; uint32_t NumVGPR = 0; + uint32_t NumArchVGPR = 0; + uint32_t NumAccVGPR = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; @@ -51,6 +53,9 @@ struct SIProgramInfo { // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; + // Final occupancy. + uint32_t Occupancy = 0; + // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. 
bool DynamicCallStack = false; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index f152deb28004..f58bc3060c42 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, } } -static cl::opt<bool> EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - static cl::opt<bool> EnableSpillSGPRToVGPR( "amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling VGPRs to SGPRs"), @@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), + ST(ST), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), AGPRPressureSets(getNumRegPressureSets()), - SpillSGPRToVGPR(false), - SpillSGPRToSMEM(false), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { - if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) - SpillSGPRToSMEM = true; - else if (EnableSpillSGPRToVGPR) - SpillSGPRToVGPR = true; - unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); } static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { @@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } @@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VCC_HI); } - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { @@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } + // Reserve all the rest AGPRs if there are no instructions to use it. + if (!ST.hasMAIInsts()) { + for (unsigned i = 0; i < MaxNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); @@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const bool SIRegisterInfo::requiresFrameIndexScavenging( const MachineFunction &MF) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasStackObjects()) - return true; - - // May need to deal with callee saved registers. 
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return !Info->isEntryFunction(); + // Do not use frame virtual registers. They used to be used for SGPRs, but + // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the + // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a + // spill. + return false; } bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return false; - - // The scavenger is used for large frames which may require finding a free - // register for large offsets. - if (!isUInt<12>(MFI.getStackSize())) - return true; - - // If using scalar stores, for spills, m0 is needed for the scalar store - // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual - // register for it during frame index elimination, so the scavenger is - // directly needed. - return MF.getSubtarget<GCNSubtarget>().hasScalarStores() && - MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); + return MFI.hasStackObjects(); } bool SIRegisterInfo::requiresVirtualBaseRegisters( @@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); if (Offset == 0) { BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) @@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(Offset); @@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - - MachineBasicBlock *MBB = MI.getParent(); - MachineFunction *MF = MBB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = Subtarget.getInstrInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); #ifndef NDEBUG // FIXME: Is it possible to be storing a frame index to itself? 
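Alongside the frame-index changes, these SIRegisterInfo.cpp hunks drop the repeated MF.getSubtarget<GCNSubtarget>() lookups in favour of the ST reference the constructor now caches (the ST(ST) initializer added above). A minimal standalone sketch of that pattern, using made-up Fake* types rather than the real GCNSubtarget/SIInstrInfo:

#include <cassert>

// Illustrative stand-ins; FakeSubtarget/FakeRegisterInfo are not LLVM types.
struct FakeInstrInfo {};
struct FakeSubtarget {
  FakeInstrInfo TII;
  const FakeInstrInfo *getInstrInfo() const { return &TII; }
  unsigned getWavefrontSizeLog2() const { return 6; }
};

// The pattern these hunks apply: store the subtarget reference once at
// construction instead of re-deriving it from the MachineFunction in every
// member function that needs TII or wavefront parameters.
class FakeRegisterInfo {
  const FakeSubtarget &ST; // cached for the lifetime of the object
public:
  explicit FakeRegisterInfo(const FakeSubtarget &S) : ST(S) {}
  unsigned waveShift() const { return ST.getWavefrontSizeLog2(); }
  const FakeInstrInfo *instrInfo() const { return ST.getInstrInfo(); }
};

int main() {
  FakeSubtarget ST;
  FakeRegisterInfo TRI(ST);
  assert(TRI.waveShift() == 6 && TRI.instrInfo() != nullptr);
  return 0;
}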
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, #endif MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); +#ifndef NDEBUG + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); +#endif assert(FIOp && FIOp->isFI() && "frame index must be address operand"); assert(TII->isMUBUF(MI)); assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == - MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && - "should only be seeing frame offset relative FrameIndex"); - + MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() && + "should only be seeing stack pointer offset relative FrameIndex"); MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } -static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, +static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, + MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, @@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane); @@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI, // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not // need to handle the case where an SGPR may need to be spilled while spilling. -static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, +static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset) { + const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = MI->mayStore(); @@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr()) + if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr()) return true; MachineInstrBuilder NewMI = @@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { - unsigned SubReg = NumSubRegs == 1 ? - ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); + Register SubReg = NumSubRegs == 1 + ? 
Register(ValueReg) + : getSubReg(ValueReg, getSubRegFromChannel(i)); unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); @@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SrcDstRegState |= getKillRegState(IsKill); } - auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill); + auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill); if (!MIB.getInstr()) { unsigned FinalReg = SubReg; @@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) @@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } } -static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, - bool Store) { - if (SuperRegSize % 16 == 0) { - return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR }; - } - - if (SuperRegSize % 8 == 0) { - return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR }; - } - - return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR : - AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; -} - bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (OnlyToVGPR && !SpillToVGPR) return false; - MachineRegisterInfo &MRI = MF->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned SuperReg = MI->getOperand(0).getReg(); + Register SuperReg = MI->getOperand(0).getReg(); bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; - - Register FrameReg = getFrameRegister(*MF); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && SuperReg != MFI->getFrameOffsetReg() && SuperReg != MFI->getScratchWaveOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - - unsigned ScalarStoreOp; unsigned EltSize = 4; const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, true); - } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. + Register TmpVGPR; + // SubReg carries the "Kill" flag when SubReg == SuperReg. unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? 
- SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - int64_t FrOffset = FrameInfo.getObjectOffset(Index); - - // The allocated memory size is really the wavefront size * the frame - // index size. The widest register class is 64 bytes, so a 4-byte scratch - // allocation is enough to spill this in a single stack object. - // - // FIXME: Frame size/offsets are computed earlier than this, so the extra - // space is still unnecessarily allocated. - - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - EltSize, MinAlign(Align, EltSize * i)); - - // SMEM instructions only support a single offset, so increment the wave - // offset. - - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp)) - .addReg(SubReg, getKillRegState(IsKill)) // sdata - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, return false; // Spill SGPR to a frame index. - // TODO: Should VI try to spill to VGPR and then spill to SMEM? - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // TODO: Should VI try to spill to VGPR and then spill to SMEM? + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); MachineInstrBuilder Mov - = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(SubReg, SubKillState); - // There could be undef components of a spilled super register. // TODO: Can we detect this and skip the spill? 
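With virtual registers no longer available this late in the pipeline, the rewritten SGPR spill path above scavenges one temporary VGPR (TmpVGPR) on first use and reuses it for every remaining sub-register. A small self-contained sketch of that scavenge-once shape; FakeScavenger and spillAllSubRegs are illustrative stand-ins, not the real RegScavenger API:

#include <cassert>
#include <optional>
#include <vector>

// Stand-in for RegScavenger::scavengeRegister: hands out a free "register id".
struct FakeScavenger {
  unsigned Next = 100;
  unsigned scavenge() { return Next++; }
};

// Pattern from the spillSGPR/restoreSGPR hunks: scavenge the temporary VGPR
// lazily, the first time a sub-register actually needs it, and reuse it for
// every remaining sub-register of the spill.
unsigned spillAllSubRegs(const std::vector<int> &SubRegs, FakeScavenger &RS) {
  std::optional<unsigned> TmpVGPR; // not scavenged yet
  unsigned SpillsEmitted = 0;
  for (int Sub : SubRegs) {
    (void)Sub;
    if (!TmpVGPR)                  // first use: grab the temporary once
      TmpVGPR = RS.scavenge();
    // ... the real code builds V_MOV_B32 + SI_SPILL_V32_SAVE using TmpVGPR ...
    ++SpillsEmitted;
  }
  return SpillsEmitted;
}

int main() {
  FakeScavenger RS;
  std::vector<int> Subs{0, 1, 2, 3};
  assert(spillAllSubRegs(Subs, RS) == 4);
  assert(RS.Next == 101); // only one temporary was scavenged for all four stores
  return 0;
}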
if (NumSubRegs > 1) { @@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, EltSize, MinAlign(Align, EltSize * i)); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpReg, RegState::Kill) // src + .addReg(TmpVGPR, RegState::Kill) // src .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srrsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, RegScavenger *RS, bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); @@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, return false; MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); - unsigned SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = spillSGPRToSMEM(); - if (SpillToSMEM && OnlyToVGPR) - return false; + Register SuperReg = MI->getOperand(0).getReg(); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); - unsigned OffsetReg = AMDGPU::M0; unsigned M0CopyReg = AMDGPU::NoRegister; - if (SpillToSMEM) { - if (RS->isRegUsed(AMDGPU::M0)) { - M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg) - .addReg(AMDGPU::M0); - } - } - unsigned EltSize = 4; - unsigned ScalarLoadOp; - - Register FrameReg = getFrameRegister(*MF); const TargetRegisterClass *RC = getPhysRegClass(SuperReg); - if (SpillToSMEM && isSGPRClass(RC)) { - // XXX - if private_element_size is larger than 4 it might be useful to be - // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = - getSpillEltSize(getRegSizeInBits(*RC) / 8, false); - } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - // SubReg carries the "Kill" flag when SubReg == SuperReg. - int64_t FrOffset = FrameInfo.getObjectOffset(Index); + Register TmpVGPR; for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = NumSubRegs == 1 ? - SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToSMEM) { - // FIXME: Size may be > 4 but extra bytes wasted. - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, - EltSize, MinAlign(Align, EltSize * i)); - - // Add i * 4 offset - int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i); - if (Offset != 0) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg) - .addReg(FrameReg) - .addImm(Offset); - } else { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) - .addReg(FrameReg); - } - - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg) - .addReg(MFI->getScratchRSrcReg()) // sbase - .addReg(OffsetReg, RegState::Kill) // soff - .addImm(0) // glc - .addImm(0) // dlc - .addMemOperand(MMO); - - if (NumSubRegs > 1 && i == 0) - MIB.addReg(SuperReg, RegState::ImplicitDefine); - - continue; - } + Register SubReg = + NumSubRegs == 1 ? 
SuperReg : getSubReg(SuperReg, SplitParts[i]); if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; @@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, // Restore SGPR from a stack slot. // FIXME: We should use S_LOAD_DWORD here for VI. - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (!TmpVGPR.isValid()) + TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); unsigned Align = FrameInfo.getObjectAlignment(Index); MachinePointerInfo PtrInfo @@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, MachineMemOperand::MOLoad, EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) .addFrameIndex(Index) // vaddr .addReg(MFI->getScratchRSrcReg()) // srsrc .addReg(MFI->getStackPtrOffsetReg()) // soffset @@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpReg, RegState::Kill); + .addReg(TmpVGPR, RegState::Kill); if (NumSubRegs > 1) MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); @@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // In an entry function/kernel the offset is already the absolute // address relative to the frame register. - unsigned DiffReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register TmpDiffReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + + // If there's no free SGPR, in-place modify the FP + Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; Register ResultReg = IsCopy ? MI->getOperand(0).getReg() : - MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) .addReg(FrameReg) @@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) - .addImm(Log2_32(ST.getWavefrontSize())) + .addImm(ST.getWavefrontSizeLog2()) .addReg(DiffReg); } else { - unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) - .addImm(Log2_32(ST.getWavefrontSize())) - .addReg(DiffReg, RegState::Kill); - - // TODO: Fold if use instruction is another add of a constant. 
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { + Register ScaledReg = + RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), + ScaledReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(DiffReg, RegState::Kill); + + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + + // TODO: Fold if use instruction is another add of a constant. + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + // FIXME: This can fail + MIB.addImm(Offset); + MIB.addReg(ScaledReg, RegState::Kill); + if (!IsVOP2) + MIB.addImm(0); // clamp bit + } else { + Register ConstOffsetReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false); + + // This should always be able to use the unused carry out. + assert(ConstOffsetReg && "this scavenge should not be able to fail"); + + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + MIB.addReg(ConstOffsetReg, RegState::Kill); + MIB.addReg(ScaledReg, RegState::Kill); + MIB.addImm(0); // clamp bit + } } else { - unsigned ConstOffsetReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) - .addImm(Offset); - TII->getAddNoCarry(*MBB, MI, DL, ResultReg) - .addReg(ConstOffsetReg, RegState::Kill) + // We have to produce a carry out, and we there isn't a free SGPR + // pair for it. We can keep the whole computation on the SALU to + // avoid clobbering an additional register at the cost of an extra + // mov. + + // We may have 1 free scratch SGPR even though a carry out is + // unavailable. Only one additional mov is needed. + Register TmpScaledReg = + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) - .addImm(0); // clamp bit + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) + .addReg(ScaledReg, RegState::Kill); + + // If there were truly no free SGPRs, we need to undo everything. + if (!TmpScaledReg.isValid()) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) + .addReg(ScaledReg, RegState::Kill) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); + } } } + if (!TmpDiffReg.isValid()) { + // Restore the FP. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) + .addReg(FrameReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } + // Don't introduce an extra copy if we're just materializing in a mov. 
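The SALU fallback above computes the per-lane offset as ((FrameReg - ScratchWaveOffset) >> log2(wave size)) + Imm, and when it has to reuse a live SGPR it undoes the shift and add afterwards so the old value survives. A compilable sketch of that arithmetic and its round-trip restore, with a plain integer standing in for the SGPR and made-up values:

#include <cassert>
#include <cstdint>

struct SGPR { uint32_t Val; };

// Mirror of the "no free SGPRs" path: shift and add in place, copy the result
// out, then reverse the operations to restore the clobbered register.
uint32_t computeAndRestore(SGPR &Diff, unsigned WaveSizeLog2, uint32_t Imm) {
  uint32_t Saved = Diff.Val;   // the live value we must not lose
  Diff.Val >>= WaveSizeLog2;   // S_LSHR_B32
  Diff.Val += Imm;             // S_ADD_U32
  uint32_t Result = Diff.Val;  // COPY into the result register
  Diff.Val -= Imm;             // S_SUB_U32  (undo, in reverse order)
  Diff.Val <<= WaveSizeLog2;   // S_LSHL_B32
  assert(Diff.Val == (Saved & ~((1u << WaveSizeLog2) - 1)) &&
         "bits below the wave-size alignment do not survive the round trip");
  return Result;
}

int main() {
  SGPR Diff{0x1000};                             // wave-aligned difference
  uint32_t Off = computeAndRestore(Diff, 6, 16); // wave64, +16 byte frame offset
  assert(Off == (0x1000u >> 6) + 16);
  assert(Diff.Val == 0x1000);                    // fully restored because aligned
  return 0;
}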
if (IsCopy) MI->eraseFromParent(); @@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t NewOffset = OldImm + Offset; if (isUInt<12>(NewOffset) && - buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { + buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); return; } @@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); @@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { - const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg); - unsigned Size = getRegSizeInBits(*RC); - unsigned AltName = AMDGPU::NoRegAltName; - - switch (Size) { - case 32: AltName = AMDGPU::Reg32; break; - case 64: AltName = AMDGPU::Reg64; break; - case 96: AltName = AMDGPU::Reg96; break; - case 128: AltName = AMDGPU::Reg128; break; - case 160: AltName = AMDGPU::Reg160; break; - case 256: AltName = AMDGPU::Reg256; break; - case 512: AltName = AMDGPU::Reg512; break; - case 1024: AltName = AMDGPU::Reg1024; break; - } - return AMDGPUInstPrinter::getRegisterName(Reg, AltName); + return AMDGPUInstPrinter::getRegisterName(Reg); } // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + assert(!Register::isVirtualRegister(Reg)); static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, @@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { unsigned Size = getRegSizeInBits(*RC); - if (Size < 32) - return false; switch (Size) { case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; @@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; case 1024: return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + case 1: + return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; default: - llvm_unreachable("Invalid register class size"); + assert(Size < 32 && "Invalid register class size"); + return false; } } @@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_512RegClass; case 1024: return &AMDGPU::VReg_1024RegClass; + case 1: + return &AMDGPU::VReg_1RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( case 96: return &AMDGPU::SReg_96RegClass; case 128: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 160: return &AMDGPU::SReg_160RegClass; case 256: @@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( case 3: return &AMDGPU::SReg_96RegClass; case 4: - return &AMDGPU::SReg_128RegClass; + return &AMDGPU::SGPR_128RegClass; case 5: return &AMDGPU::SReg_160RegClass; case 8: @@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( } } +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return !ST.hasMFMAInlineLiteralBug(); + + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; +} + bool SIRegisterInfo::shouldRewriteCopySrc( const TargetRegisterClass *DefRC, unsigned DefSubReg, @@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC const TargetRegisterClass* SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const { - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return getPhysRegClass(Reg); @@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), @@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; case AMDGPU::SGPRRegBankID: - return &AMDGPU::SReg_32_XM0RegClass; + return &AMDGPU::SReg_32RegClass; case AMDGPU::SCCRegBankID: // This needs to return an allocatable class, so don't bother returning // the dummy SCC class. - return &AMDGPU::SReg_32_XM0RegClass; + // + // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this + // was not an VCC bank value since we use the larger class SReg_32 for + // other values. These should all use SReg_32. + return &AMDGPU::SGPR_32RegClass; default: llvm_unreachable("unknown register bank"); } } case 32: return RB.getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; case 64: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : &AMDGPU::SReg_64_XEXECRegClass; @@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, &AMDGPU::SReg_96RegClass; case 128: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SReg_128RegClass; + &AMDGPU::SGPR_128RegClass; case 160: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : &AMDGPU::SReg_160RegClass; @@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, case 512: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : &AMDGPU::SReg_512RegClass; + case 1024: + return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass : + &AMDGPU::SReg_1024RegClass; default: if (Size < 32) return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32_XM0RegClass; + &AMDGPU::SReg_32RegClass; return nullptr; } } @@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { - if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg())) + const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); + if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); - return nullptr; + + const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>(); + return getAllocatableClass(RC); } unsigned SIRegisterInfo::getVCC() const { @@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { if (!LIS->hasInterval(Reg)) return nullptr; LiveInterval &LI = LIS->getInterval(Reg); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 34487c96e72e..ac3dea1a1a28 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -27,6 +27,7 @@ class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { private: + const GCNSubtarget &ST; unsigned SGPRSetID; unsigned VGPRSetID; unsigned AGPRSetID; @@ -34,7 +35,6 @@ private: BitVector VGPRPressureSets; BitVector AGPRPressureSets; bool SpillSGPRToVGPR; - bool SpillSGPRToSMEM; bool isWave32; void classifyPressureSet(unsigned PSetID, unsigned Reg, @@ -46,10 +46,6 @@ public: return SpillSGPRToVGPR; } - bool spillSGPRToSMEM() const { - return SpillSGPRToSMEM; - } - /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; @@ -141,7 +137,7 @@ public: bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { const TargetRegisterClass *RC; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (Register::isVirtualRegister(Reg)) RC = MRI.getRegClass(Reg); else RC = getPhysRegClass(Reg); @@ -193,10 +189,7 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
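The inline-constant query documented here is moved out of line in the SIRegisterInfo.cpp hunk earlier in this patch, where it refuses inline constants for MFMA accumulator (AC) operand types on subtargets with the mfma-inline-literal-bug feature. A standalone sketch of that check; the numeric enum values below are invented, only the FIRST/LAST range names mirror the real operand-type enums:

#include <cassert>

// Illustrative operand-type ranges (values are made up; the real constants are
// the OPERAND_SRC_* and OPERAND_REG_INLINE_AC_* enumerators).
enum OperandType {
  OPERAND_SRC_FIRST = 10,
  OPERAND_SRC_LAST = 20,
  OPERAND_REG_INLINE_AC_FIRST = 30,
  OPERAND_REG_INLINE_AC_LAST = 40,
};

// Shape of the new check: AC (MFMA accumulator) operands only accept inline
// constants when the subtarget does not have the MFMA inline-literal bug.
bool opCanUseInlineConstant(unsigned OpType, bool HasMFMAInlineLiteralBug) {
  if (OpType >= OPERAND_REG_INLINE_AC_FIRST && OpType <= OPERAND_REG_INLINE_AC_LAST)
    return !HasMFMAInlineLiteralBug;
  return OpType >= OPERAND_SRC_FIRST && OpType <= OPERAND_SRC_LAST;
}

int main() {
  assert(opCanUseInlineConstant(15, /*Bug=*/true));  // ordinary src operand: fine
  assert(!opCanUseInlineConstant(35, /*Bug=*/true)); // AC operand, buggy subtarget: no
  assert(opCanUseInlineConstant(35, /*Bug=*/false)); // AC operand elsewhere: ok
  return 0;
}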
- bool opCanUseInlineConstant(unsigned OpType) const { - return OpType >= AMDGPU::OPERAND_SRC_FIRST && - OpType <= AMDGPU::OPERAND_SRC_LAST; - } + bool opCanUseInlineConstant(unsigned OpType) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, @@ -270,7 +263,7 @@ public: const MachineRegisterInfo &MRI) const override; const TargetRegisterClass *getBoolRC() const { - return isWave32 ? &AMDGPU::SReg_32_XM0RegClass + return isWave32 ? &AMDGPU::SReg_32RegClass : &AMDGPU::SReg_64RegClass; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index d5948a7862cc..82219cbdf3b2 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -37,50 +37,52 @@ class getSubRegs<int size> { !if(!eq(size, 16), ret16, ret32)))))); } -let Namespace = "AMDGPU" in { -defset list<RegAltNameIndex> AllRegAltNameIndices = { - def Reg32 : RegAltNameIndex; - def Reg64 : RegAltNameIndex; - def Reg96 : RegAltNameIndex; - def Reg128 : RegAltNameIndex; - def Reg160 : RegAltNameIndex; - def Reg256 : RegAltNameIndex; - def Reg512 : RegAltNameIndex; - def Reg1024 : RegAltNameIndex; -} -} +// Generates list of sequential register tuple names. +// E.g. RegSeq<3,2,2,"s">.ret -> [ "s[0:1]", "s[2:3]" ] +class RegSeqNames<int last_reg, int stride, int size, string prefix, + int start = 0> { + int next = !add(start, stride); + int end_reg = !add(!add(start, size), -1); + list<string> ret = + !if(!le(end_reg, last_reg), + !listconcat([prefix # "[" # start # ":" # end_reg # "]"], + RegSeqNames<last_reg, stride, size, prefix, next>.ret), + []); +} + +// Generates list of dags for register tupless. +class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size, + int start = 0> { + dag trunc_rc = (trunc RC, + !if(!and(!eq(stride, 1), !eq(start, 0)), + !add(!add(last_reg, 2), !mul(size, -1)), + !add(last_reg, 1))); + list<dag> ret = + !if(!lt(start, size), + !listconcat([(add (decimate (shl trunc_rc, start), stride))], + RegSeqDags<RC, last_reg, stride, size, !add(start, 1)>.ret), + []); +} + +class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC, + int last_reg, int stride, int size, string prefix> : + RegisterTuples<Indices, + RegSeqDags<RC, last_reg, stride, size>.ret, + RegSeqNames<last_reg, stride, size, prefix>.ret>; //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg <string n, bits<16> regIdx = 0, string prefix = "", - int regNo = !cast<int>(regIdx)> : - Register<n, !if(!eq(prefix, ""), - [ n, n, n, n, n, n, n, n ], - [ prefix # regNo, - prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]", - prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]", - ])>, +class SIReg <string n, bits<16> regIdx = 0> : + Register<n>, DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let RegAltNameIndices = AllRegAltNameIndices; // This is the not yet the complete register encoding. An additional // bit is set for VGPRs. 
let HWEncoding = regIdx; } -class SIRegisterWithSubRegs<string n, list<Register> subregs> : - RegisterWithSubRegs<n, subregs> { - let RegAltNameIndices = AllRegAltNameIndices; - let AltNames = [ n, n, n, n, n, n, n, n ]; -} - // Special Registers def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; @@ -93,7 +95,7 @@ def SP_REG : SIReg<"sp", 0>; def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // VCC for 64-bit instructions -def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -103,7 +105,7 @@ def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, +def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -134,7 +136,7 @@ def LDS_DIRECT : SIReg <"src_lds_direct", 254>; def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; -def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, +def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, DwarfRegAlias<XNACK_MASK_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -145,7 +147,7 @@ def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_ def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; -def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, DwarfRegAlias<TBA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -155,7 +157,7 @@ def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, def TMA_LO : SIReg<"tma_lo", 110>; def TMA_HI : SIReg<"tma_hi", 111>; -def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, DwarfRegAlias<TMA_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -175,7 +177,7 @@ multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { } class FlatReg <Register lo, Register hi, bits<16> encoding> : - SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>, + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; @@ -191,19 +193,19 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; // SGPR registers foreach Index = 0-105 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">; + def SGPR#Index : SIReg <"s"#Index, Index>; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> { + def VGPR#Index : SIReg <"v"#Index, Index> { let HWEncoding{8} = 1; } } // AccVGPR registers foreach Index = 0-255 in { - def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> { + def AGPR#Index : SIReg <"a"#Index, Index> { let HWEncoding{8} = 1; } } @@ -226,102 +228,32 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "SGPR%u", 0, 105)), Reg32> { + (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. 
let AllocationPriority = 9; } // SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret, - [(add (decimate SGPR_32, 2)), - (add (decimate (shl SGPR_32, 1), 2))]>; +def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">; // SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. -def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret, - [(add (decimate SGPR_32, 3)), - (add (decimate (shl SGPR_32, 1), 3)), - (add (decimate (shl SGPR_32, 2), 3))]>; +def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">; // SGPR 128-bit registers -def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4))]>; +def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">; // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. -def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4))]>; +def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">; // SGPR 256-bit registers -def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4))]>; +def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">; // SGPR 512-bit registers -def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add (decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4))]>; +def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">; // SGPR 1024-bit registers -def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret, - [(add (decimate SGPR_32, 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add (decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4)), - (add (decimate (shl SGPR_32, 16), 4)), - (add (decimate (shl SGPR_32, 17), 4)), - (add (decimate (shl SGPR_32, 18), 4)), - (add (decimate (shl SGPR_32, 19), 4)), - (add (decimate (shl SGPR_32, 20), 4)), - (add 
(decimate (shl SGPR_32, 21), 4)), - (add (decimate (shl SGPR_32, 22), 4)), - (add (decimate (shl SGPR_32, 23), 4)), - (add (decimate (shl SGPR_32, 24), 4)), - (add (decimate (shl SGPR_32, 25), 4)), - (add (decimate (shl SGPR_32, 26), 4)), - (add (decimate (shl SGPR_32, 27), 4)), - (add (decimate (shl SGPR_32, 28), 4)), - (add (decimate (shl SGPR_32, 29), 4)), - (add (decimate (shl SGPR_32, 30), 4)), - (add (decimate (shl SGPR_32, 31), 4))]>; +def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">; // Trap handler TMP 32-bit registers def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, @@ -330,51 +262,21 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, } // Trap handler TMP 64-bit registers -def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret, - [(add (decimate TTMP_32, 2)), - (add (decimate (shl TTMP_32, 1), 2))]>; +def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">; // Trap handler TMP 128-bit registers -def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4))]>; - -def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4)), - (add (decimate (shl TTMP_32, 4), 4)), - (add (decimate (shl TTMP_32, 5), 4)), - (add (decimate (shl TTMP_32, 6), 4)), - (add (decimate (shl TTMP_32, 7), 4))]>; - -def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret, - [(add (decimate TTMP_32, 4)), - (add (decimate (shl TTMP_32, 1), 4)), - (add (decimate (shl TTMP_32, 2), 4)), - (add (decimate (shl TTMP_32, 3), 4)), - (add (decimate (shl TTMP_32, 4), 4)), - (add (decimate (shl TTMP_32, 5), 4)), - (add (decimate (shl TTMP_32, 6), 4)), - (add (decimate (shl TTMP_32, 7), 4)), - (add (decimate (shl TTMP_32, 8), 4)), - (add (decimate (shl TTMP_32, 9), 4)), - (add (decimate (shl TTMP_32, 10), 4)), - (add (decimate (shl TTMP_32, 11), 4)), - (add (decimate (shl TTMP_32, 12), 4)), - (add (decimate (shl TTMP_32, 13), 4)), - (add (decimate (shl TTMP_32, 14), 4)), - (add (decimate (shl TTMP_32, 15), 4))]>; +def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">; + +def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">; + +def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">; class TmpRegTuplesBase<int index, int size, list<Register> subRegs, list<SubRegIndex> indices = getSubRegs<size>.ret, int index1 = !add(index, !add(size, -1)), string name = "ttmp["#index#":"#index1#"]"> : - SIRegisterWithSubRegs<name, subRegs> { + RegisterWithSubRegs<name, subRegs> { let HWEncoding = subRegs[0].HWEncoding; let SubRegIndices = indices; } @@ -448,196 +350,80 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; +class RegisterTypes<list<ValueType> reg_types> { + list<ValueType> types = reg_types; +} + +def Reg16Types : RegisterTypes<[i16, f16]>; +def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; + + // VGPR 32-bit registers // i16/f16 only on VI+ -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "VGPR%u", 0, 255)), Reg32> { +def 
VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, + (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; } // VGPR 64-bit registers -def VGPR_64 : RegisterTuples<getSubRegs<2>.ret, - [(add (trunc VGPR_32, 255)), - (add (shl VGPR_32, 1))]>; +def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">; // VGPR 96-bit registers -def VGPR_96 : RegisterTuples<getSubRegs<3>.ret, - [(add (trunc VGPR_32, 254)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2))]>; +def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">; // VGPR 128-bit registers -def VGPR_128 : RegisterTuples<getSubRegs<4>.ret, - [(add (trunc VGPR_32, 253)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3))]>; +def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">; // VGPR 160-bit registers -def VGPR_160 : RegisterTuples<getSubRegs<5>.ret, - [(add (trunc VGPR_32, 252)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4))]>; +def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">; // VGPR 256-bit registers -def VGPR_256 : RegisterTuples<getSubRegs<8>.ret, - [(add (trunc VGPR_32, 249)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7))]>; +def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">; // VGPR 512-bit registers -def VGPR_512 : RegisterTuples<getSubRegs<16>.ret, - [(add (trunc VGPR_32, 241)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15))]>; +def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">; // VGPR 1024-bit registers -def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret, - [(add (trunc VGPR_32, 225)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15)), - (add (shl VGPR_32, 16)), - (add (shl VGPR_32, 17)), - (add (shl VGPR_32, 18)), - (add (shl VGPR_32, 19)), - (add (shl VGPR_32, 20)), - (add (shl VGPR_32, 21)), - (add (shl VGPR_32, 22)), - (add (shl VGPR_32, 23)), - (add (shl VGPR_32, 24)), - (add (shl VGPR_32, 25)), - (add (shl VGPR_32, 26)), - (add (shl VGPR_32, 27)), - (add (shl VGPR_32, 28)), - (add (shl VGPR_32, 29)), - (add (shl VGPR_32, 30)), - (add (shl VGPR_32, 31))]>; +def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">; // AccVGPR 32-bit registers def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add (sequence "AGPR%u", 0, 255)), Reg32> { + (add (sequence "AGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; } // AGPR 64-bit registers -def AGPR_64 : RegisterTuples<getSubRegs<2>.ret, - [(add (trunc AGPR_32, 255)), - (add (shl AGPR_32, 1))]>; +def AGPR_64 : 
SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">; // AGPR 128-bit registers -def AGPR_128 : RegisterTuples<getSubRegs<4>.ret, - [(add (trunc AGPR_32, 253)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3))]>; +def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">; // AGPR 512-bit registers -def AGPR_512 : RegisterTuples<getSubRegs<16>.ret, - [(add (trunc AGPR_32, 241)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3)), - (add (shl AGPR_32, 4)), - (add (shl AGPR_32, 5)), - (add (shl AGPR_32, 6)), - (add (shl AGPR_32, 7)), - (add (shl AGPR_32, 8)), - (add (shl AGPR_32, 9)), - (add (shl AGPR_32, 10)), - (add (shl AGPR_32, 11)), - (add (shl AGPR_32, 12)), - (add (shl AGPR_32, 13)), - (add (shl AGPR_32, 14)), - (add (shl AGPR_32, 15))]>; +def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">; // AGPR 1024-bit registers -def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret, - [(add (trunc AGPR_32, 225)), - (add (shl AGPR_32, 1)), - (add (shl AGPR_32, 2)), - (add (shl AGPR_32, 3)), - (add (shl AGPR_32, 4)), - (add (shl AGPR_32, 5)), - (add (shl AGPR_32, 6)), - (add (shl AGPR_32, 7)), - (add (shl AGPR_32, 8)), - (add (shl AGPR_32, 9)), - (add (shl AGPR_32, 10)), - (add (shl AGPR_32, 11)), - (add (shl AGPR_32, 12)), - (add (shl AGPR_32, 13)), - (add (shl AGPR_32, 14)), - (add (shl AGPR_32, 15)), - (add (shl AGPR_32, 16)), - (add (shl AGPR_32, 17)), - (add (shl AGPR_32, 18)), - (add (shl AGPR_32, 19)), - (add (shl AGPR_32, 20)), - (add (shl AGPR_32, 21)), - (add (shl AGPR_32, 22)), - (add (shl AGPR_32, 23)), - (add (shl AGPR_32, 24)), - (add (shl AGPR_32, 25)), - (add (shl AGPR_32, 26)), - (add (shl AGPR_32, 27)), - (add (shl AGPR_32, 28)), - (add (shl AGPR_32, 29)), - (add (shl AGPR_32, 30)), - (add (shl AGPR_32, 31))]>; +def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">; //===----------------------------------------------------------------------===// // Register classes used as source and destination //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> { + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let isAllocatable = 0; let CopyCost = -1; } def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, - (add PRIVATE_RSRC_REG), Reg128> { + (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; } def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add LDS_DIRECT), Reg32> { + (add LDS_DIRECT)> { let isAllocatable = 0; let CopyCost = -1; } @@ -648,41 +434,40 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, - SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> { + SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { let AllocationPriority = 10; } def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> { + (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 10; } def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), 
Reg32> { + (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 10; } // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 10; } def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS), - Reg32> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, - (add SGPR_64Regs), Reg64> { + (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 11; } // CCR (call clobbered registers) SGPR 64-bit registers def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add (trunc SGPR_64, 16)), Reg64> { + (add (trunc SGPR_64, 16))> { let CopyCost = SGPR_64.CopyCost; let AllocationPriority = SGPR_64.AllocationPriority; } @@ -693,13 +478,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, } def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; } def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SReg_64_XEXEC, EXEC), Reg64> { + (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 13; } @@ -722,17 +507,17 @@ let CopyCost = 2 in { // There are no 3-component scalar instructions, but this is needed // for symmetry with VGPRs. def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96Regs), Reg96> { + (add SGPR_96Regs)> { let AllocationPriority = 14; } def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, - (add SGPR_96), Reg96> { + (add SGPR_96)> { let AllocationPriority = 14; } def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, - (add SGPR_128Regs), Reg128> { + (add SGPR_128Regs)> { let AllocationPriority = 15; } @@ -742,8 +527,9 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, } def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add SGPR_128, TTMP_128), Reg128> { + (add SGPR_128, TTMP_128)> { let AllocationPriority = 15; + let isAllocatable = 0; } } // End CopyCost = 2 @@ -751,17 +537,16 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, // There are no 5-component scalar instructions, but this is needed // for symmetry with VGPRs. 
def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160Regs), Reg160> { + (add SGPR_160Regs)> { let AllocationPriority = 16; } def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add SGPR_160), Reg160> { + (add SGPR_160)> { let AllocationPriority = 16; } -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs), - Reg256> { +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { let AllocationPriority = 17; } @@ -770,14 +555,14 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { } def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add SGPR_256, TTMP_256), Reg256> { + (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; let AllocationPriority = 17; } def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add SGPR_512Regs), Reg512> { + (add SGPR_512Regs)> { let AllocationPriority = 18; } @@ -787,31 +572,31 @@ def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, } def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add SGPR_512, TTMP_512), Reg512> { + (add SGPR_512, TTMP_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; let AllocationPriority = 18; } def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, LDS_DIRECT_CLASS), Reg32> { + (add VGPR_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add SGPR_1024Regs), Reg1024> { + (add SGPR_1024Regs)> { let AllocationPriority = 19; } def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add SGPR_1024), Reg1024> { + (add SGPR_1024)> { let CopyCost = 16; let AllocationPriority = 19; } // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, - (add VGPR_64), Reg64> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32, + (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy @@ -819,7 +604,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 let AllocationPriority = 2; } -def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> { +def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> { let Size = 96; // Requires 3 v_mov_b32 to copy @@ -828,7 +613,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add VGPR_128), Reg128> { + (add VGPR_128)> { let Size = 128; // Requires 4 v_mov_b32 to copy @@ -837,7 +622,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, } def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add VGPR_160), Reg160> { + (add VGPR_160)> { let Size = 160; // Requires 5 v_mov_b32 to copy @@ -846,28 +631,28 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, } def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add VGPR_256), Reg256> { + (add VGPR_256)> { let Size = 256; let CopyCost = 8; let AllocationPriority = 6; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add VGPR_512), Reg512> { + (add VGPR_512)> { let Size = 512; let CopyCost = 16; let AllocationPriority = 7; } def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add VGPR_1024), Reg1024> { + (add VGPR_1024)> { let Size = 1024; let CopyCost = 32; let AllocationPriority 
= 8; } def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, - (add AGPR_64), Reg64> { + (add AGPR_64)> { let Size = 64; let CopyCost = 5; @@ -875,7 +660,7 @@ def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32 } def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add AGPR_128), Reg128> { + (add AGPR_128)> { let Size = 128; // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr @@ -884,40 +669,39 @@ def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, } def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add AGPR_512), Reg512> { + (add AGPR_512)> { let Size = 512; let CopyCost = 33; let AllocationPriority = 7; } def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add AGPR_1024), Reg1024> { + (add AGPR_1024)> { let Size = 1024; let CopyCost = 65; let AllocationPriority = 8; } -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> { - let Size = 32; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 1; } def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> { + (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64), - Reg64> { +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; } def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add AGPR_32, VGPR_32), Reg32> { + (add AGPR_32, VGPR_32)> { let isAllocatable = 0; } def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, - (add AReg_64, VReg_64), Reg64> { + (add AReg_64, VReg_64)> { let isAllocatable = 0; } diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 7ee178149c7a..8afca2cdc325 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -77,8 +77,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, // Try to fold Src0 MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { - unsigned Reg = Src0.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { + Register Reg = Src0.getReg(); + if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); @@ -360,8 +360,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, } if (NewImm != 0) { - if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && - SrcReg->isReg()) { + if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) { MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); return true; @@ -394,12 +393,11 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, if (!MO.isReg()) continue; - if (TargetRegisterInfo::isPhysicalRegister(Reg) && - TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(Reg) && + Register::isPhysicalRegister(MO.getReg())) { if (TRI.regsOverlap(Reg, MO.getReg())) return true; - } else if (MO.getReg() == Reg && - TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) { LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & 
TRI.getSubRegIndexLaneMask(MO.getSubReg()); if (Overlap.any()) @@ -425,7 +423,7 @@ static TargetInstrInfo::RegSubRegPair getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { if (TRI.getRegSizeInBits(Reg, MRI) != 32) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (Register::isPhysicalRegister(Reg)) { Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); } else { LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub); @@ -459,13 +457,13 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || MovT.getOpcode() == AMDGPU::COPY); - unsigned T = MovT.getOperand(0).getReg(); + Register T = MovT.getOperand(0).getReg(); unsigned Tsub = MovT.getOperand(0).getSubReg(); MachineOperand &Xop = MovT.getOperand(1); if (!Xop.isReg()) return nullptr; - unsigned X = Xop.getReg(); + Register X = Xop.getReg(); unsigned Xsub = Xop.getSubReg(); unsigned Size = TII->getOpSize(MovT, 0) / 4; @@ -484,7 +482,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, MovY.getOperand(1).getSubReg() != Tsub) continue; - unsigned Y = MovY.getOperand(0).getReg(); + Register Y = MovY.getOperand(0).getReg(); unsigned Ysub = MovY.getOperand(0).getSubReg(); if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) @@ -579,7 +577,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // XXX - not exactly a check for post-regalloc run. MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && - TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { + Register::isPhysicalRegister(MI.getOperand(0).getReg())) { int32_t ReverseImm; if (isReverseInlineImm(TII, Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); @@ -643,8 +641,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // FIXME: This could work better if hints worked with subregisters. If // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. - if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && - Src0->isReg()) { + if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) { MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; @@ -672,8 +669,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); - if (Src.isImm() && - TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) { + if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) { int32_t ReverseImm; if (isKImmOperand(TII, Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); @@ -721,8 +717,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { - unsigned DstReg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register DstReg = MI.getOperand(0).getReg(); + if (Register::isVirtualRegister(DstReg)) { // VOPC instructions can only write to the VCC register. 
We can't // force them to use VCC here, because this is only one register and // cannot deal with sequences which would require multiple copies of @@ -745,8 +741,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (!Src2->isReg()) continue; - unsigned SReg = Src2->getReg(); - if (TargetRegisterInfo::isVirtualRegister(SReg)) { + Register SReg = Src2->getReg(); + if (Register::isVirtualRegister(SReg)) { MRI.setRegAllocationHint(SReg, 0, VCCReg); continue; } @@ -766,7 +762,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { bool Next = false; if (SDst->getReg() != VCCReg) { - if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + if (Register::isVirtualRegister(SDst->getReg())) MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); Next = true; } @@ -774,7 +770,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. if (Src2 && Src2->getReg() != VCCReg) { - if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + if (Register::isVirtualRegister(Src2->getReg())) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); Next = true; } diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 4e07efff55d8..cb4cf68d709a 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -273,12 +273,12 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, if (!Use.isReg() || !Use.isUse()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!Register::isVirtualRegister(Reg)) { if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) continue; @@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, char GlobalFlags = 0; bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); SmallVector<MachineInstr *, 4> SetInactiveInstrs; + SmallVector<MachineInstr *, 4> SoftWQMInstrs; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // correct, so we need it to be in WQM. 
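The SIShrinkInstructions and SIWholeQuadMode hunks in this region also carry the mechanical migration from bare 'unsigned' register values to llvm::Register, with the virtual/physical predicates moving from TargetRegisterInfo onto Register. A minimal sketch of the new spelling (illustration only, not code from this patch; the helper name is invented):

    #include "llvm/CodeGen/Register.h"

    // Old spelling: TargetRegisterInfo::isVirtualRegister(Reg) on a bare unsigned.
    // New spelling: the predicate is a static member of llvm::Register, which
    // wraps the same underlying value.
    static bool isRewritableVReg(llvm::Register Reg) {
      return llvm::Register::isVirtualRegister(Reg);
    }

Because Register converts to and from unsigned, call sites such as 'Register Reg = MO.getReg()' keep their shape while the predicates gain a type-checked home, which is all these hunks change.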
Flags = StateWQM; LowerToCopyInstrs.push_back(&MI); + } else if (Opcode == AMDGPU::SOFT_WQM) { + LowerToCopyInstrs.push_back(&MI); + SoftWQMInstrs.push_back(&MI); + continue; } else if (Opcode == AMDGPU::WWM) { // The WWM intrinsic doesn't make the same guarantee, and plus it needs // to be executed in WQM or Exact so that its copy doesn't clobber @@ -356,8 +361,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (Inactive.isUndef()) { LowerToCopyInstrs.push_back(&MI); } else { - unsigned Reg = Inactive.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = Inactive.getReg(); + if (Register::isVirtualRegister(Reg)) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) markInstruction(DefMI, StateWWM, Worklist); } @@ -385,9 +390,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); - if (!TRI->isVirtualRegister(Reg) && + if (!Register::isVirtualRegister(Reg) && TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; @@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is // ever used anywhere in the function. This implements the corresponding // semantics of @llvm.amdgcn.set.inactive. + // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm. if (GlobalFlags & StateWQM) { for (MachineInstr *MI : SetInactiveInstrs) markInstruction(*MI, StateWQM, Worklist); + for (MachineInstr *MI : SoftWQMInstrs) + markInstruction(*MI, StateWQM, Worklist); } return GlobalFlags; @@ -548,7 +556,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { - unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); MachineInstr *Save = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg) @@ -832,7 +840,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); - unsigned Dest = MI->getOperand(0).getReg(); + Register Dest = MI->getOperand(0).getReg(); MachineInstr *Copy = BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) .addReg(LiveMaskReg); @@ -847,13 +855,12 @@ void SIWholeQuadMode::lowerCopyInstrs() { for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) MI->RemoveOperand(i); - const unsigned Reg = MI->getOperand(0).getReg(); + const Register Reg = MI->getOperand(0).getReg(); if (TRI->isVGPR(*MRI, Reg)) { - const TargetRegisterClass *regClass = - TargetRegisterInfo::isVirtualRegister(Reg) - ? MRI->getRegClass(Reg) - : TRI->getPhysRegClass(Reg); + const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg) + ? MRI->getRegClass(Reg) + : TRI->getPhysRegClass(Reg); const unsigned MovOp = TII->getMovOpcode(regClass); MI->setDesc(TII->get(MovOp)); @@ -885,7 +892,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM)) + if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty()) return !LiveMaskQueries.empty(); } else { // Store a copy of the original live mask when required diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 1b410b6b5912..1a74ebbf8165 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -793,9 +793,18 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { // selector to prefer those. let AddedComplexity = 100 in { -defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +foreach vt = Reg32Types.types in { +defm : SMRD_Pattern <"S_LOAD_DWORD", vt>; +} + +foreach vt = SReg_64.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>; +} + +foreach vt = SReg_128.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; +} + defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index dfafdccc05a3..d31a49f428ee 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -181,7 +181,9 @@ def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (ctpop i32:$src0))] >; -def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">; +def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", + [(set i32:$sdst, (ctpop i64:$src0))] +>; } // End Defs = [SCC] def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; @@ -417,16 +419,16 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] + [(set i32:$sdst, (smin i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] + [(set i32:$sdst, (umin i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] + [(set i32:$sdst, (smax i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] + [(set i32:$sdst, (umax i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -853,13 +855,13 @@ class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, let Defs = [SCC]; } class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC_Base < + string opName, SDPatternOperator cond> : SOPC_Base < op, rc, rc, opName, [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { } class SOPC_CMP_32<bits<7> op, string opName, - PatLeaf cond = COND_NULL, string revOp = opName> + SDPatternOperator cond = COND_NULL, string revOp = opName> : SOPC_Helper<op, SSrc_b32, i32, opName, cond>, Commutable_REV<revOp, !eq(revOp, opName)>, SOPKInstTable<0, opName> { @@ -868,7 +870,7 @@ class SOPC_CMP_32<bits<7> op, string opName, } class SOPC_CMP_64<bits<7> op, string opName, - PatLeaf cond = COND_NULL, string revOp = opName> + SDPatternOperator cond = COND_NULL, string revOp = opName> : SOPC_Helper<op, SSrc_b64, i64, opName, cond>, Commutable_REV<revOp, !eq(revOp, opName)> { let isCompare = 1; @@ -1076,8 +1078,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", 
[(int_amdgcn_s_barrier)]> { let SchedRW = [WriteBarrier]; let simm16 = 0; - let mayLoad = 1; - let mayStore = 1; let isConvergent = 1; } @@ -1090,7 +1090,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16", - [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>; + [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; @@ -1099,7 +1099,7 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; // maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the // maximum really 15 on VI? def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), - "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { + "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; @@ -1110,12 +1110,11 @@ def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; let Uses = [EXEC, M0] in { // FIXME: Should this be mayLoad+mayStore? def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", - [(AMDGPUsendmsg (i32 imm:$simm16))] ->; + [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>; def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16", - [(AMDGPUsendmsghalt (i32 imm:$simm16))] ->; + [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>; + } // End Uses = [EXEC, M0] def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> { @@ -1126,13 +1125,13 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; } def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16", - [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> { + [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; } def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16", - [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> { + [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; @@ -1169,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in { def S_ROUND_MODE : SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; def S_DENORM_MODE : - SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">; + SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16", + [(SIdenorm_mode (i32 timm:$simm16))]> { + let hasSideEffects = 1; + } def S_TTRACEDATA_IMM : SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; } // End SubtargetPredicate = isGFX10Plus @@ -1178,7 +1180,7 @@ let SubtargetPredicate = isGFX10Plus in { // S_GETREG_B32 Intrinsic Pattern. 
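Several scalar instructions in the SOPInstructions.td hunks above gain direct selection patterns: s_bcnt1_i32_b64 now matches a 64-bit ctpop, the s_min/s_max group matches the plain min/max nodes rather than only their uniform-wrapped forms, and the intrinsic patterns switch from imm to timm for their immediate operands. As a rough illustration of the IR shape the new ctpop pattern targets (a hypothetical helper, not code from this patch; whether it actually lands on the scalar instruction still depends on divergence analysis and the rest of selection):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"

    using namespace llvm;

    // Emit a 64-bit population count narrowed to i32; with the pattern added
    // above, a uniform value shaped like this can be matched to
    // s_bcnt1_i32_b64.
    static Value *emitPopCount64(IRBuilder<> &B, Value *Src64) {
      Value *Cnt = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Src64); // i64 result
      return B.CreateTrunc(Cnt, B.getInt32Ty());                    // i32 count
    }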
//===----------------------------------------------------------------------===// def : GCNPat < - (int_amdgcn_s_getreg imm:$simm16), + (int_amdgcn_s_getreg timm:$simm16), (S_GETREG_B32 (as_i16imm $simm16)) >; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e90f40e6abea..afb2fd987afd 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -131,29 +131,70 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { struct MUBUFInfo { uint16_t Opcode; uint16_t BaseOpcode; - uint8_t dwords; + uint8_t elements; bool has_vaddr; bool has_srsrc; bool has_soffset; }; +struct MTBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t elements; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MTBUFInfoTable_DECL +#define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" +int getMTBUFBaseOpcode(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); + return Info ? Info->Opcode : -1; +} + +int getMTBUFElements(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->elements : 0; +} + +bool getMTBUFHasVAddr(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMTBUFHasSrsrc(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMTBUFHasSoffset(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + int getMUBUFBaseOpcode(unsigned Opc) { const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); return Info ? Info->BaseOpcode : -1; } -int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) { - const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords); +int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); return Info ? Info->Opcode : -1; } -int getMUBUFDwords(unsigned Opc) { +int getMUBUFElements(unsigned Opc) { const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); - return Info ? Info->dwords : 0; + return Info ? Info->elements : 0; } bool getMUBUFHasVAddr(unsigned Opc) { @@ -241,7 +282,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) { - return getMaxWavesPerEU() * getEUsPerCU(STI); + return getMaxWavesPerEU(STI) * getEUsPerCU(STI); } unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, @@ -253,9 +294,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { return 1; } -unsigned getMaxWavesPerEU() { +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. 
- return 10; + if (!isGFX10(*STI)) + return 10; + return 20; } unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, @@ -317,7 +360,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { if (Version.Major >= 10) return 0; - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1); @@ -394,17 +437,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { - return 256; + if (!isGFX10(*STI)) + return 256; + return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { - return getTotalNumVGPRs(STI); + return 256; } unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU()) + if (WavesPerEU >= getMaxWavesPerEU(STI)) return 0; unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1), @@ -510,7 +555,7 @@ bool isReadOnlySegment(const GlobalValue *GV) { } bool shouldEmitConstantsToTextSection(const Triple &TT) { - return TT.getOS() != Triple::AMDHSA; + return TT.getOS() == Triple::AMDPAL; } int getIntegerAttribute(const Function &F, StringRef Name, int Default) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 209ef7eef749..f78dadd447ff 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI without any kind of limitation. -unsigned getMaxWavesPerEU(); +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p /// STI and limited by given \p FlatWorkGroupSize. 
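Taken together, the AMDGPUBaseInfo changes in this hunk make the occupancy limits subtarget-dependent: getMaxWavesPerEU now takes the MCSubtargetInfo, GFX10 reports 20 waves per EU instead of 10, and the total VGPR file grows to 512 (wave64) or 1024 (wave32) registers while a single wave can still only address 256. A small sketch restating those numbers (an illustration of the values in this diff, not the library code; the struct and function are invented for the example):

    // Occupancy-relevant limits as encoded by this change.
    struct OccupancyLimits {
      unsigned MaxWavesPerEU;    // 10 before GFX10, 20 on GFX10
      unsigned TotalVGPRs;       // 256 before GFX10; 512 (wave64) / 1024 (wave32)
      unsigned AddressableVGPRs; // 256 per wave on all targets
    };

    static OccupancyLimits getLimits(bool IsGFX10, bool IsWave32) {
      if (!IsGFX10)
        return {10, 256, 256};
      return {20, IsWave32 ? 1024u : 512u, 256};
    }

This is also why the callers shown above (getMaxWavesPerCU, getMinNumSGPRs, getMinNumVGPRs) now thread the subtarget through instead of calling a parameterless getMaxWavesPerEU().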
@@ -264,13 +264,31 @@ LLVM_READONLY const MIMGInfo *getMIMGInfo(unsigned Opc); LLVM_READONLY +int getMTBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements); + +LLVM_READONLY +int getMTBUFElements(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSoffset(unsigned Opc); + +LLVM_READONLY int getMUBUFBaseOpcode(unsigned Opc); LLVM_READONLY -int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords); +int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements); LLVM_READONLY -int getMUBUFDwords(unsigned Opc); +int getMUBUFElements(unsigned Opc); LLVM_READONLY bool getMUBUFHasVAddr(unsigned Opc); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index db20d5ccf5f9..207e4232e829 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -21,6 +21,8 @@ #include "SIDefines.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/EndianStream.h" diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 6bc416ed7d4b..f1cdc3097dc0 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -104,9 +104,21 @@ multiclass VOP1Inst <string opName, VOPProfile P, SDPatternOperator node = null_frag> { def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; - def _sdwa : VOP1_SDWA_Pseudo <opName, P>; + + foreach _ = BoolToList<P.HasExtSDWA>.ret in + def _sdwa : VOP1_SDWA_Pseudo <opName, P>; + foreach _ = BoolToList<P.HasExtDPP>.ret in def _dpp : VOP1_DPP_Pseudo <opName, P>; + + def : MnemonicAlias<opName#"_e32", opName>, LetDummies; + def : MnemonicAlias<opName#"_e64", opName>, LetDummies; + + foreach _ = BoolToList<P.HasExtSDWA>.ret in + def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies; + + foreach _ = BoolToList<P.HasExtDPP>.ret in + def : MnemonicAlias<opName#"_dpp", opName>, LetDummies; } // Special profile for instructions which have clamp @@ -227,10 +239,10 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; } // End SchedRW = [WriteQuarterRate32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; +defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; @@ -434,7 +446,7 @@ let SubtargetPredicate = isGFX10Plus in { // Target-specific instruction encodings. 
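The header side of the AMDGPUBaseInfo change mirrors the .cpp hunk earlier: the MUBUF lookup helpers are renamed from dword counts to element counts, and MTBUF instructions get an equivalent searchable table of their own. A sketch of the intended use (assumed usage, not code from this patch; the wrapper function is invented for illustration):

    #include "Utils/AMDGPUBaseInfo.h"  // in-tree include path for these helpers

    using namespace llvm;

    // Map an MTBUF opcode to the sibling opcode with a different element
    // count, e.g. when widening or narrowing a typed buffer access.
    static int getMTBUFVariant(unsigned Opc, unsigned NewElements) {
      int BaseOpc = AMDGPU::getMTBUFBaseOpcode(Opc);
      if (BaseOpc < 0)
        return -1; // not an MTBUF instruction
      // Returns -1 if no variant with that element count exists.
      return AMDGPU::getMTBUFOpcode(BaseOpc, NewElements);
    }

The MUBUF helpers follow the same shape (getMUBUFBaseOpcode / getMUBUFOpcode / getMUBUFElements), with getMUBUFDwords retired in favor of getMUBUFElements.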
//===----------------------------------------------------------------------===// -class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : +class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> : VOP_DPP<ps.OpName, p, isDPP16> { let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; @@ -448,8 +460,9 @@ class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = let Inst{31-25} = 0x3f; } -class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : - VOP1_DPP<op, ps, p, 1> { +class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> : + VOP1_DPP<op, ps, p, 1>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> { let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let SubtargetPredicate = HasDPP16; } @@ -492,6 +505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } multiclass VOP1_Real_sdwa_gfx10<bits<9> op> { + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { @@ -499,11 +513,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP1_Real_dpp_gfx10<bits<9> op> { - def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> { let DecoderNamespace = "SDWA10"; } } multiclass VOP1_Real_dpp8_gfx10<bits<9> op> { + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -704,10 +720,12 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> { multiclass VOP1_Real_vi <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -831,25 +849,25 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; -let OtherPredicates = [isGFX8GFX9] in { +let OtherPredicates = [isGFX8Plus] in { def : GCNPat < - (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl)), + (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, - imm:$bank_mask, imm:$bound_ctrl)), + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; -} // End OtherPredicates = [isGFX8GFX9] +} // End OtherPredicates = 
[isGFX8Plus] let OtherPredicates = [isGFX8Plus] in { def : GCNPat< @@ -885,6 +903,7 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; } + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -904,23 +923,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let OtherPredicates = [isGFX10Plus] in { def : GCNPat < - (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)), + (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0)) >; - -def : GCNPat < - (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl)), - (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl), (i32 0)) ->; - -def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, - imm:$bank_mask, imm:$bound_ctrl)), - (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl), (i32 0)) ->; } // End OtherPredicates = [isGFX10Plus] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 1b30cd2ed516..1ab0fc1ab58d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -147,7 +147,8 @@ multiclass VOP2Inst_sdwa<string opName, string revOp = opName, bit GFX9Renamed = 0> { let renamedInGFX9 = GFX9Renamed in { - def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + foreach _ = BoolToList<P.HasExtSDWA>.ret in + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; } // End renamedInGFX9 = GFX9Renamed } @@ -179,9 +180,10 @@ multiclass VOP2bInst <string opName, let usesCustomInserter = !eq(P.NumSrcArgs, 2); } - def _sdwa : VOP2_SDWA_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } + foreach _ = BoolToList<P.HasExtSDWA>.ret in + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } foreach _ = BoolToList<P.HasExtDPP>.ret in def _dpp : VOP2_DPP_Pseudo <opName, P>; } @@ -220,9 +222,10 @@ multiclass VOP2eInst <string opName, def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } + foreach _ = BoolToList<P.HasExtSDWA>.ret in + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2e"; + } foreach _ = BoolToList<P.HasExtDPP>.ret in def _dpp : VOP2_DPP_Pseudo <opName, P>; @@ -251,7 +254,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> { class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); - field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); + field dag Ins32 = !if(!eq(vt.Size, 32), + (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm), + (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm)); field bit HasExt = 0; // Hack to stop printing _e64 @@ -519,7 +524,7 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, } // End isConvergent = 1 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, 
add_ctpop>; defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; @@ -539,9 +544,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>; } // End isCommutable = 1 } // End SubtargetPredicate = isGFX6GFX7GFX10 @@ -606,9 +611,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; } // End FPDPRounding = 1 -defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; -defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; +defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>; +defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>; +defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>; let isCommutable = 1 in { let FPDPRounding = 1 in { @@ -618,16 +623,16 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; +defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>; +defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; -defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; -defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; +defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>; +defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>; +defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; +defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { @@ -653,16 +658,17 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1, - isCommutable = 1 in { + isCommutable = 1, + IsDOT = 1 in { let SubtargetPredicate = HasDot5Insts in - defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + defm V_DOT2C_F32_F16 : 
VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; let SubtargetPredicate = HasDot6Insts in - defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; + defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; let SubtargetPredicate = HasDot4Insts in - defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; + defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>; let SubtargetPredicate = HasDot3Insts in - defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; + defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>; } let AddedComplexity = 30 in { @@ -719,50 +725,17 @@ defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; // Note: 16-bit instructions produce a 0 result in the high 16-bits // on GFX8 and GFX9 and preserve high 16 bits on GFX10+ -def ClearHI16 : OutPatFrag<(ops node:$op), - (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>; - -multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst, - bit PreservesHI16 = 0> { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) ->; - -def : GCNPat< - (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)) ->; - -def : GCNPat< - (i64 (zext (op i16:$src0, i16:$src1))), - (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)), - sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; -} - -multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst, - bit PreservesHI16 = 0> { - -def : GCNPat< - (op i16:$src0, i16:$src1), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) ->; +multiclass Arithmetic_i16_0Hi_Pats <SDPatternOperator op, Instruction inst> { def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) + (inst $src0, $src1) >; - def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), - sub0, + (inst $src0, $src1), sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; } @@ -774,53 +747,36 @@ class ZExt_i16_i1_Pat <SDNode ext> : GCNPat < $src) >; -let Predicates = [Has16BitInsts] in { - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>; -defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>; -defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>; -defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>; -defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>; -defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>; -defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>; -} - -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>; -defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>; -defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>; -defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>; -defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>; -defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>; -defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>; -} - +foreach vt = [i16, v2i16] in { def : GCNPat < - (and i16:$src0, i16:$src1), - (V_AND_B32_e64 $src0, $src1) + (and vt:$src0, vt:$src1), + (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (or i16:$src0, i16:$src1), - (V_OR_B32_e64 $src0, $src1) + (or vt:$src0, vt:$src1), + (V_OR_B32_e64 
VSrc_b32:$src0, VSrc_b32:$src1) >; def : GCNPat < - (xor i16:$src0, i16:$src1), - (V_XOR_B32_e64 $src0, $src1) + (xor vt:$src0, vt:$src1), + (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) >; - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { -defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>; -defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>; -defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>; } -let Predicates = [Has16BitInsts, isGFX10Plus] in { -defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>; -defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>; -defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>; +let Predicates = [Has16BitInsts] in { + +let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { +defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>; +defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>; +defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>; +defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>; +defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>; +defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>; +defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>; +defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>; +defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>; +defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>; } def : ZExt_i16_i1_Pat<zext>; @@ -847,7 +803,7 @@ def : GCNPat< // Target-specific instruction encodings. //===----------------------------------------------------------------------===// -class VOP2_DPP<bits<6> op, VOP2_Pseudo ps, +class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl, bit IsDPP16 = 0> : VOP_DPP<opName, p, IsDPP16> { @@ -865,13 +821,18 @@ class VOP2_DPP<bits<6> op, VOP2_Pseudo ps, let Inst{31} = 0x0; } -class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps, +class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl> : VOP2_DPP<op, ps, opName, p, 1> { let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); let SubtargetPredicate = HasDPP16; } +class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + Base_VOP2_DPP16<op, ps, opName, p>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>; + class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl> : VOP_DPP8<ps.OpName, p> { @@ -924,6 +885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } multiclass VOP2_Real_sdwa_gfx10<bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { @@ -931,11 +893,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2_Real_dpp_gfx10<bits<6> op> { - def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { let DecoderNamespace = "SDWA10"; } } multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -964,6 +928,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let DecoderNamespace = "SDWA10" in { 
multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -973,13 +938,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, string asmName> { - def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16; } } multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -989,13 +956,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } // End DecoderNamespace = "SDWA10" //===------------------------------ VOP2be ------------------------------===// - multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> { + multiclass VOP2be_Real_e32_gfx10<bits<6> op, string opName, string asmName> { def _e32_gfx10 : VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>, VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> { VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); } + } + multiclass VOP2be_Real_e64_gfx10<bits<6> op, string opName, string asmName> { def _e64_gfx10 : VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>, VOP3be_gfx10<{0, 1, 0, 0, op{5-0}}, @@ -1003,6 +972,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64"); let AsmString = asmName # Ps.AsmOperands; } + } + multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1010,64 +982,76 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w32_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); + let isAsmParserOnly = 1; + let DecoderNamespace = "SDWA10"; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_w64_gfx10 : + Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, + VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); + let AsmString = asmName # Ps.AsmOperands; + let isAsmParserOnly = 1; + let DecoderNamespace 
= "SDWA10"; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w32_gfx10 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w64_gfx10 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - - let WaveSizePredicate = isWave32 in { - def _sdwa_w32_gfx10 : - Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, - VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); - let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands); - let isAsmParserOnly = 1; - let DecoderNamespace = "SDWA10"; - } - def _dpp_w32_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); - let isAsmParserOnly = 1; - } - def _dpp8_w32_gfx10 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave32 - - let WaveSizePredicate = isWave64 in { - def _sdwa_w64_gfx10 : - Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, - VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa"); - let AsmString = asmName # Ps.AsmOperands; - let isAsmParserOnly = 1; - let DecoderNamespace = "SDWA10"; - } - def _dpp_w64_gfx10 : - VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # AsmDPP; - let isAsmParserOnly = 1; - } - def _dpp8_w64_gfx10 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # AsmDPP8; - let isAsmParserOnly = 1; - } - } // End WaveSizePredicate = isWave64 + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w32_gfx10 : + VOP2_DPP8<op, 
!cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w64_gfx10 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } } //===----------------------------- VOP3Only -----------------------------===// @@ -1088,8 +1072,19 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" -multiclass Base_VOP2_Real_gfx10<bits<6> op> : - VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>; +multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> : + VOP2be_Real_e32_gfx10<op, opName, asmName>, + VOP2be_Real_e64_gfx10<op, opName, asmName>, + VOP2be_Real_sdwa_gfx10<op, opName, asmName>, + VOP2be_Real_dpp_gfx10<op, opName, asmName>, + VOP2be_Real_dpp8_gfx10<op, opName, asmName>; + +multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> : + VOP2_Real_e32_gfx10<op>, + VOP2_Real_e64_gfx10<op>, + VOP2be_Real_sdwa_gfx10<op, opName, asmName>, + VOP2be_Real_dpp_gfx10<op, opName, asmName>, + VOP2be_Real_dpp8_gfx10<op, opName, asmName>; multiclass VOP2_Real_gfx10<bits<6> op> : VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>, @@ -1103,7 +1098,6 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName, VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>, VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>; -defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>; defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; @@ -1136,6 +1130,9 @@ defm V_SUB_CO_CI_U32 : defm V_SUBREV_CO_CI_U32 : VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; +defm V_CNDMASK_B32 : + VOP2e_Real_gfx10<0x001, "V_CNDMASK_B32", "v_cndmask_b32">; + // VOP3 only. 
defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>; defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>; @@ -1322,12 +1319,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : } // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8" multiclass VOP2_SDWA_Real <bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2_SDWA9_Real <bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -1350,12 +1349,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX8"; } - def _sdwa_vi : - VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, - VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in + def _sdwa_vi : + VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, + VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_vi : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>, @@ -1383,12 +1383,13 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { - VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); - let AsmString = AsmName # ps.AsmOperands; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { + VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); + let AsmString = AsmName # ps.AsmOperands; + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>, @@ -1410,10 +1411,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { let DecoderNamespace = "GFX9"; } - def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { + } foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, @@ -1554,7 +1556,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; } // End SubtargetPredicate = HasDLInsts multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : 
   VOP2_Real_e32_vi<op> {
-  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+  def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 21dbef9240e1..605425972b1c 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -112,7 +112,7 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
 
 class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
   list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
-                                        imm:$cbsz, imm:$abid, imm:$blgp))];
+                                        timm:$cbsz, timm:$abid, timm:$blgp))];
 }
 
 class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
@@ -385,12 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
 
 let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
 def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
+} // End SubtargetPredicate = isGFX6GFX7GFX10
 
 let SubtargetPredicate = isGFX8Plus in {
 def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
@@ -399,21 +399,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
 } // End SubtargetPredicate = isGFX8Plus
 } // End SchedRW = [Write64Bit]
 
-let Predicates = [isGFX8Plus] in {
-def : GCNPat <
-  (getDivergentFrag<shl>.ret i64:$x, i32:$y),
-  (V_LSHLREV_B64 $y, $x)
->;
-def : AMDGPUPat <
-  (getDivergentFrag<srl>.ret i64:$x, i32:$y),
-  (V_LSHRREV_B64 $y, $x)
->;
-def : AMDGPUPat <
-  (getDivergentFrag<sra>.ret i64:$x, i32:$y),
-  (V_ASHRREV_I64 $y, $x)
->;
-}
-
 let SchedRW = [Write32Bit] in {
 
 let SubtargetPredicate = isGFX8Plus in {
@@ -468,13 +453,13 @@ let FPDPRounding = 1 in {
 def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
 let Uses = [M0, EXEC] in {
 def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
-       [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
-                                             (i32 imm:$attr),
-                                             (i32 imm:$src0_modifiers),
+       [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 timm:$attrchan),
+                                             (i32 timm:$attr),
+                                             (i32 timm:$src0_modifiers),
                                              (f32 VRegSrc_32:$src2),
-                                             (i32 imm:$src2_modifiers),
-                                             (i1 imm:$high),
-                                             (i1 imm:$clamp)))]>;
+                                             (i32 timm:$src2_modifiers),
+                                             (i1 timm:$high),
+                                             (i1 timm:$clamp)))]>;
 } // End Uses = [M0, EXEC]
 } // End FPDPRounding = 1
 } // End renamedInGFX9 = 1
@@ -493,21 +478,21 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
 
 let Uses = [M0, EXEC], FPDPRounding = 1 in {
 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
-       [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
-                                               (i32 imm:$attr),
-                                               (i32 imm:$src0_modifiers),
-                                               (i1 imm:$high),
-                                               (i1 imm:$clamp),
-                                               (i32 imm:$omod)))]>;
+       [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan),
+                                               (i32 timm:$attr),
+                                               (i32 timm:$src0_modifiers),
+                                               (i1 timm:$high),
+                                               (i1 timm:$clamp),
+                                               (i32 timm:$omod)))]>;
 
 def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
-       [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
-                                               (i32 imm:$attr),
-                                               (i32 imm:$src0_modifiers),
+       [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan),
+                                               (i32 timm:$attr),
+                                               (i32 timm:$src0_modifiers),
                                                (f32 VRegSrc_32:$src2),
-                                               (i32 imm:$src2_modifiers),
-                                               (i1 imm:$high),
-                                               (i1 imm:$clamp),
-                                               (i32 imm:$omod)))]>;
+                                               (i32 timm:$src2_modifiers),
+                                               (i1 timm:$high),
+                                               (i1 timm:$clamp),
+                                               (i32 timm:$omod)))]>;
 } // End Uses = [M0, EXEC], FPDPRounding = 1
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -657,11 +642,11 @@ let SubtargetPredicate = isGFX10Plus in {
   } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 
   def : GCNPat<
-    (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+    (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
     (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
   >;
   def : GCNPat<
-    (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+    (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
     (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
   >;
 } // End SubtargetPredicate = isGFX10Plus
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 55ee5f6577cf..0c13f39fec02 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -261,6 +261,7 @@ class SDot2Pat<Instruction Inst> : GCNPat <
   let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
 }
 
+let IsDOT = 1 in {
 let SubtargetPredicate = HasDot2Insts in {
 
 def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
@@ -277,6 +278,7 @@ def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32
 def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
 
 } // End SubtargetPredicate = HasDot1Insts
+} // End let IsDOT = 1
 
 multiclass DotPats<SDPatternOperator dot_op, VOP3PInst dot_inst> {
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index b3513e383d10..8ef0ec7b71f4 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -183,7 +183,7 @@ multiclass VOPCXInstAliases <string OpName, string Arch> {
 }
 
-class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
+class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set i1:$sdst,
       (setcc (P.Src0VT
@@ -202,7 +202,7 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
 
 multiclass VOPC_Pseudos <string opName,
                          VOPC_Profile P,
-                         PatLeaf cond = COND_NULL,
+                         SDPatternOperator cond = COND_NULL,
                          string revOp = opName,
                          bit DefExec = 0> {
@@ -225,6 +225,7 @@ multiclass VOPC_Pseudos <string opName,
     let isCommutable = 1;
   }
 
+  foreach _ = BoolToList<P.HasExtSDWA>.ret in
   def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
     let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
     let SchedRW = P.Schedule;
@@ -236,7 +237,7 @@ multiclass VOPC_Pseudos <string opName,
 
 let SubtargetPredicate = HasSdstCMPX in {
 multiclass VOPCX_Pseudos <string opName,
                           VOPC_Profile P, VOPC_Profile P_NoSDst,
-                          PatLeaf cond = COND_NULL,
+                          SDPatternOperator cond = COND_NULL,
                           string revOp = opName> :
   VOPC_Pseudos <opName, P, cond, revOp, 1> {
@@ -261,6 +262,7 @@ multiclass VOPCX_Pseudos <string opName,
     let SubtargetPredicate = HasNoSdstCMPX;
   }
 
+  foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
   def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
@@ -285,22 +287,23 @@ def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
 def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
 def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
 
-multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
+                     string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
 
-multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
 
-multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
 
-multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
 
-multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
 
-multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
   VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
 
 multiclass VOPCX_F16 <string opName, string revOp = opName> :
@@ -669,6 +672,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
     let SchedRW = p.Schedule;
   }
 
+  foreach _ = BoolToList<p.HasExtSDWA>.ret in
  def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
     let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
                             !if(DefVcc, [VCC], []));
@@ -698,6 +702,7 @@ multiclass VOPCX_Class_Pseudos <string opName,
     let SubtargetPredicate = HasNoSdstCMPX;
   }
 
+  foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
   def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
     let Defs = [EXEC];
     let SchedRW = P_NoSDst.Schedule;
@@ -737,8 +742,11 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
 defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
 defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
 defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+
+let SubtargetPredicate = Has16BitInsts in {
 defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
 defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+}
 
 //===----------------------------------------------------------------------===//
 // V_ICMPIntrinsic Pattern.
@@ -878,6 +886,7 @@ let AssemblerPredicate = isGFX10Plus in {
       }
     } // End DecoderNamespace = "GFX10"
 
+    foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
     def _sdwa_gfx10 :
       VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
       VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -903,6 +912,7 @@ let AssemblerPredicate = isGFX10Plus in {
       }
     } // End DecoderNamespace = "GFX10"
 
+    foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in
     def _sdwa_gfx10 :
       VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
      VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
@@ -1223,10 +1233,12 @@ multiclass VOPC_Real_vi <bits<10> op> {
     }
   }
 
+  foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
   def _sdwa_vi :
     VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
   def _sdwa_gfx9 :
     VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 677095a354be..f208a1134a5a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -14,6 +14,7 @@ class LetDummies {
   bit isReMaterializable;
   bit isAsCheapAsAMove;
   bit VOPAsmPrefer32Bit;
+  bit FPDPRounding;
   Predicate SubtargetPredicate;
   string Constraints;
   string DisableEncoding;
@@ -41,9 +42,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
                   string asm, list<dag> pattern> :
   InstSI <outs, ins, asm, pattern>,
   VOP <opName>,
-  SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
-  MnemonicAlias<opName#suffix, opName> {
-
+  SIMCInstr <opName#suffix, SIEncodingFamily.NONE> {
   let isPseudo = 1;
   let isCodeGenOnly = 1;
   let UseNamedOperandTable = 1;
@@ -148,6 +147,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
 
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
+  let OtherPredicates = ps.OtherPredicates;
   let AsmMatchConverter = ps.AsmMatchConverter;
   let AsmVariantName = ps.AsmVariantName;
   let Constraints = ps.Constraints;
@@ -473,8 +473,7 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
 class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
   VOP <opName>,
-  SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
-  MnemonicAlias <opName#"_sdwa", opName> {
+  SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> {
 
   let isPseudo = 1;
   let isCodeGenOnly = 1;
@@ -595,8 +594,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
 class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
   VOP <OpName>,
-  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
-  MnemonicAlias <OpName#"_dpp", OpName> {
+  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
 
   let isPseudo = 1;
   let isCodeGenOnly = 1;