Diffstat (limited to 'lib/Target/AMDGPU')
135 files changed, 15690 insertions, 4183 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 7b0a7f4b6058..8f6e1e7d8846 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -11,6 +11,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -23,6 +24,7 @@ class Pass; class Target; class TargetMachine; class PassRegistry; +class Module; // R600 Passes FunctionPass *createR600VectorRegMerger(TargetMachine &tm); @@ -37,6 +39,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); @@ -45,21 +48,32 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); +FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; +ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr); +void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); +extern char &AMDGPULowerIntrinsicsID; + void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIPeepholeSDWAPass(PassRegistry &); +extern char &SIPeepholeSDWAID; + void initializeSIShrinkInstructionsPass(PassRegistry&); extern char &SIShrinkInstructionsID; void initializeSIFixSGPRCopiesPass(PassRegistry &); extern char &SIFixSGPRCopiesID; +void initializeSIFixVGPRCopiesPass(PassRegistry &); +extern char &SIFixVGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -86,11 +100,11 @@ extern char &AMDGPUPromoteAllocaID; Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOpt::Level OptLevel); -ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); -FunctionPass* createAMDGPUUnifyMetadataPass(); +ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; @@ -112,6 +126,15 @@ extern char &SIDebuggerInsertNopsID; void initializeSIInsertWaitsPass(PassRegistry&); extern char &SIInsertWaitsID; +void initializeSIInsertWaitcntsPass(PassRegistry&); +extern char &SIInsertWaitcntsID; + +void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); +extern char &AMDGPUUnifyDivergentExitNodesID; + +ImmutablePass *createAMDGPUAAWrapperPass(); +void initializeAMDGPUAAWrapperPassPass(PassRegistry&); + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); @@ -133,43 +156,53 @@ enum TargetIndex { /// however on the GPU, each address space points to /// a separate piece of memory that is unique from other /// memory locations. 
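// Illustrative sketch, not part of the patch: the hunk below replaces the
// fixed AMDGPUAS enum with a struct whose PRIVATE/FLAT/REGION values depend
// on the triple, queried through the llvm::AMDGPU::getAMDGPUAS() helpers
// declared further down. Assuming "AMDGPU.h", "llvm/IR/Module.h" and
// "llvm/IR/Value.h" are included, a pass would now test address spaces like
// this instead of comparing against fixed enum constants:
static bool isFlatPointer(const llvm::Module &M, const llvm::Value *Ptr) {
  AMDGPUAS AS = llvm::AMDGPU::getAMDGPUAS(M);         // triple-dependent layout
  unsigned AddrSpace = Ptr->getType()->getPointerAddressSpace();
  return AddrSpace == AS.FLAT_ADDRESS;                // no longer a fixed constant
}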
-namespace AMDGPUAS { -enum AddressSpaces : unsigned { - PRIVATE_ADDRESS = 0, ///< Address space for private memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) - LOCAL_ADDRESS = 3, ///< Address space for local memory. - FLAT_ADDRESS = 4, ///< Address space for flat memory. - REGION_ADDRESS = 5, ///< Address space for region memory. - PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) +struct AMDGPUAS { + // The following address space values depend on the triple environment. + unsigned PRIVATE_ADDRESS; ///< Address space for private memory. + unsigned FLAT_ADDRESS; ///< Address space for flat memory. + unsigned REGION_ADDRESS; ///< Address space for region memory. + + // The maximum value for flat, generic, local, private, constant and region. + const static unsigned MAX_COMMON_ADDRESS = 5; + + const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0). + const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2) + const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. + const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressible parameter memory (CONST0) + const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressible parameter memory (VTX1) // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this // order to be able to dynamically index a constant buffer, for example: // // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, + const static unsigned CONSTANT_BUFFER_0 = 8; + const static unsigned CONSTANT_BUFFER_1 = 9; + const static unsigned CONSTANT_BUFFER_2 = 10; + const static unsigned CONSTANT_BUFFER_3 = 11; + const static unsigned CONSTANT_BUFFER_4 = 12; + const static unsigned CONSTANT_BUFFER_5 = 13; + const static unsigned CONSTANT_BUFFER_6 = 14; + const static unsigned CONSTANT_BUFFER_7 = 15; + const static unsigned CONSTANT_BUFFER_8 = 16; + const static unsigned CONSTANT_BUFFER_9 = 17; + const static unsigned CONSTANT_BUFFER_10 = 18; + const static unsigned CONSTANT_BUFFER_11 = 19; + const static unsigned CONSTANT_BUFFER_12 = 20; + const static unsigned CONSTANT_BUFFER_13 = 21; + const static unsigned CONSTANT_BUFFER_14 = 22; + const static unsigned CONSTANT_BUFFER_15 = 23; // Some places use this if the address space can't be determined. 
- UNKNOWN_ADDRESS_SPACE = ~0u + const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u; }; -} // namespace AMDGPUAS +namespace llvm { +namespace AMDGPU { +AMDGPUAS getAMDGPUAS(const Module &M); +AMDGPUAS getAMDGPUAS(const TargetMachine &TM); +AMDGPUAS getAMDGPUAS(Triple T); +} // namespace AMDGPU +} // namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 13022009af16..2c7a2d8962d0 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -67,12 +67,24 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "Support unaligned global loads and stores" >; +def FeatureTrapHandler: SubtargetFeature<"trap-handler", + "TrapHandler", + "true", + "Trap handler support" +>; + def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access", "UnalignedScratchAccess", "true", "Support unaligned scratch loads and stores" >; +def FeatureApertureRegs : SubtargetFeature<"aperture-regs", + "HasApertureRegs", + "true", + "Has Memory Aperture Base and Size Registers" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -154,6 +166,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts", "Additional intstructions for CI+" >; +def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", + "GFX9Insts", + "true", + "Additional intstructions for GFX9+" +>; + def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", "HasSMemRealTime", "true", @@ -172,6 +190,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts", "Has i16/f16 instructions" >; +def FeatureVOP3P : SubtargetFeature<"vop3p", + "HasVOP3PInsts", + "true", + "Has VOP3P packed instructions" +>; + def FeatureMovrel : SubtargetFeature<"movrel", "HasMovrel", "true", @@ -190,16 +214,22 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; -//===------------------------------------------------------------===// -// Subtarget Features (options and debugging) -//===------------------------------------------------------------===// +def FeatureSDWA : SubtargetFeature<"sdwa", + "HasSDWA", + "true", + "Support SDWA (Sub-DWORD Addressing) extension" +>; -def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", - "FP16Denormals", +def FeatureDPP : SubtargetFeature<"dpp", + "HasDPP", "true", - "Enable half precision denormal handling" + "Support DPP (Data Parallel Primitives) extension" >; +//===------------------------------------------------------------===// +// Subtarget Features (options and debugging) +//===------------------------------------------------------------===// + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. @@ -209,13 +239,36 @@ def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", "Enable single precision denormal handling" >; -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", +// Denormal handling for fp64 and fp16 is controlled by the same +// config register when fp16 supported. +// TODO: Do we need a separate f16 setting when not legal? 
+def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals", + "FP64FP16Denormals", "true", - "Enable double precision denormal handling", + "Enable double and half precision denormal handling", [FeatureFP64] >; +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64FP16Denormals", + "true", + "Enable double and half precision denormal handling", + [FeatureFP64, FeatureFP64FP16Denormals] +>; + +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP64FP16Denormals", + "true", + "Enable half precision denormal handling", + [FeatureFP64FP16Denormals] +>; + +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", @@ -343,7 +396,17 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm + FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, + FeatureDPP + ] +>; + +def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode ] >; @@ -399,6 +462,9 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, FeatureLDSBankCount16, FeatureXNACK]>; +def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,[]>; +def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,[]>; + //===----------------------------------------------------------------------===// // Debugger related subtarget features. //===----------------------------------------------------------------------===// @@ -504,14 +570,27 @@ def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; +def isGFX9 : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX9Insts">; + +// TODO: Either the name to be changed or we simply use IsCI! 
def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; + "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"FeatureCIInsts">; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; -def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, + AssemblerPredicate<"Feature16BitInsts">; +def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"FeatureVOP3P">; + +def HasSDWA : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA">; + +def HasDPP : Predicate<"Subtarget->hasDPP()">, + AssemblerPredicate<"FeatureDPP">; class PredicateControl { Predicate SubtargetPredicate; @@ -532,5 +611,6 @@ include "Processors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" +include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" include "AMDGPUCallingConv.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp new file mode 100644 index 000000000000..3c99f48e818a --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -0,0 +1,147 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMGPU address space based alias analysis pass. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-aa" + +// Register this pass... +char AMDGPUAAWrapperPass::ID = 0; +INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", + "AMDGPU Address space based Alias Analysis", false, true) + +ImmutablePass *llvm::createAMDGPUAAWrapperPass() { + return new AMDGPUAAWrapperPass(); +} + +void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +// Must match the table in getAliasResult. +AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_) + : Arch(Arch_), AS(AS_) { + // These arrarys are indexed by address space value + // enum elements 0 ... 
to 5 + static const AliasResult ASAliasRulesPrivIsZero[6][6] = { + /* Private Global Constant Group Flat Region*/ + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, + /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias}, + /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias}, + /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias}, + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias} + }; + static const AliasResult ASAliasRulesGenIsZero[6][6] = { + /* Flat Global Constant Group Region Private */ + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, + /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias}, + /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, + /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias}, + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias} + }; + assert(AS.MAX_COMMON_ADDRESS <= 5); + if (AS.FLAT_ADDRESS == 0) { + assert(AS.GLOBAL_ADDRESS == 1 && + AS.REGION_ADDRESS == 4 && + AS.LOCAL_ADDRESS == 3 && + AS.CONSTANT_ADDRESS == 2 && + AS.PRIVATE_ADDRESS == 5); + ASAliasRules = &ASAliasRulesGenIsZero; + } else { + assert(AS.PRIVATE_ADDRESS == 0 && + AS.GLOBAL_ADDRESS == 1 && + AS.CONSTANT_ADDRESS == 2 && + AS.LOCAL_ADDRESS == 3 && + AS.FLAT_ADDRESS == 4 && + AS.REGION_ADDRESS == 5); + ASAliasRules = &ASAliasRulesPrivIsZero; + } +} + +AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1, + unsigned AS2) const { + if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) { + if (Arch == Triple::amdgcn) + report_fatal_error("Pointer address space out of range"); + return AS1 == AS2 ? MayAlias : NoAlias; + } + + return (*ASAliasRules)[AS1][AS2]; +} + +AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); + unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); + + AliasResult Result = ASAliasRules.getAliasResult(asA, asB); + if (Result == NoAlias) return Result; + + // Forward the query to the next alias analysis. + return AAResultBase::alias(LocA, LocB); +} + +bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { + const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); + + if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) { + return true; + } + + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { + if (GV->isConstant()) + return true; + } else if (const Argument *Arg = dyn_cast<Argument>(Base)) { + const Function *F = Arg->getParent(); + + // Only assume constant memory for arguments on kernels. + switch (F->getCallingConv()) { + default: + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + } + + unsigned ArgNo = Arg->getArgNo(); + /* On an argument, ReadOnly attribute indicates that the function does + not write through this pointer argument, even though it may write + to the memory that the pointer points to. 
+ On an argument, ReadNone attribute indicates that the function does + not dereference that pointer argument, even though it may read or write + the memory that the pointer points to if accessed through other pointers. + */ + if (F->hasParamAttribute(ArgNo, Attribute::NoAlias) && + (F->hasParamAttribute(ArgNo, Attribute::ReadNone) || + F->hasParamAttribute(ArgNo, Attribute::ReadOnly))) { + return true; + } + } + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); +} diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h new file mode 100644 index 000000000000..5f8ed9b1f9a3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -0,0 +1,102 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMGPU address space based alias analysis pass. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H + +#include "AMDGPU.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +namespace llvm { + +/// A simple AA result that uses TBAA metadata to answer queries. +class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { + friend AAResultBase<AMDGPUAAResult>; + + const DataLayout &DL; + AMDGPUAS AS; + +public: + explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(), + DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {} + AMDGPUAAResult(AMDGPUAAResult &&Arg) + : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS), + ASAliasRules(Arg.ASAliasRules){} + + /// Handle invalidation events from the new pass manager. + /// + /// By definition, this result is stateless and so remains valid. + bool invalidate(Function &, const PreservedAnalyses &) { return false; } + + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); + bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal); + +private: + bool Aliases(const MDNode *A, const MDNode *B) const; + bool PathAliases(const MDNode *A, const MDNode *B) const; + + class ASAliasRulesTy { + public: + ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_); + AliasResult getAliasResult(unsigned AS1, unsigned AS2) const; + private: + Triple::ArchType Arch; + AMDGPUAS AS; + const AliasResult (*ASAliasRules)[6][6]; + } ASAliasRules; +}; + +/// Analysis pass providing a never-invalidated alias analysis result. +class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> { + friend AnalysisInfoMixin<AMDGPUAA>; + static char PassID; + +public: + typedef AMDGPUAAResult Result; + + AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) { + return AMDGPUAAResult(F.getParent()->getDataLayout(), + Triple(F.getParent()->getTargetTriple())); + } +}; + +/// Legacy wrapper pass to provide the AMDGPUAAResult object. 
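// Illustrative usage sketch, not part of the patch (assumes the surrounding
// llvm namespace, "llvm/Analysis/MemoryLocation.h", and two pointer Values
// GlobalPtr/LocalPtr living in the global and local address spaces of a
// Module M):
AMDGPUAAResult AAR(M.getDataLayout(), Triple(M.getTargetTriple()));
AliasResult R = AAR.alias(MemoryLocation(GlobalPtr), MemoryLocation(LocalPtr));
// For distinct global (1) and local (3) address spaces the lookup table in
// AMDGPUAliasAnalysis.cpp answers NoAlias directly; only MayAlias results are
// forwarded to the next alias analysis in the chain.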
+class AMDGPUAAWrapperPass : public ImmutablePass { + std::unique_ptr<AMDGPUAAResult> Result; + +public: + static char ID; + + AMDGPUAAWrapperPass() : ImmutablePass(ID) { + initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + AMDGPUAAResult &getResult() { return *Result; } + const AMDGPUAAResult &getResult() const { return *Result; } + + bool doInitialization(Module &M) override { + Result.reset(new AMDGPUAAResult(M.getDataLayout(), + Triple(M.getTargetTriple()))); + return false; + } + bool doFinalization(Module &M) override { + Result.reset(); + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} +#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 067a16a2af7f..1d03714874e2 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -24,8 +24,10 @@ namespace { class AMDGPUAlwaysInline : public ModulePass { static char ID; + bool GlobalOpt; + public: - AMDGPUAlwaysInline() : ModulePass(ID) { } + AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } }; @@ -45,8 +47,10 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } - for (GlobalAlias* A : AliasesToRemove) { - A->eraseFromParent(); + if (GlobalOpt) { + for (GlobalAlias* A : AliasesToRemove) { + A->eraseFromParent(); + } } for (Function &F : M) { @@ -70,6 +74,6 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { return false; } -ModulePass *llvm::createAMDGPUAlwaysInlinePass() { - return new AMDGPUAlwaysInline(); +ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) { + return new AMDGPUAlwaysInline(GlobalOpt); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index c98d25e20185..3d8db7cd8af5 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" @@ -26,7 +27,9 @@ namespace { class AMDGPUAnnotateKernelFeatures : public ModulePass { private: - static bool hasAddrSpaceCast(const Function &F); + const TargetMachine *TM; + AMDGPUAS AS; + static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS); void addAttrToCallers(Function *Intrin, StringRef AttrName); bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); @@ -34,7 +37,8 @@ private: public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) : + ModulePass(ID), TM(TM_) {} bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; @@ -45,10 +49,11 @@ public: ModulePass::getAnalysisUsage(AU); } - static bool visitConstantExpr(const ConstantExpr *CE); + static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS); static bool visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited); + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, + AMDGPUAS AS); }; } @@ -62,18 +67,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, // 
The queue ptr is only needed when casting to flat, not from it. -static bool castRequiresQueuePtr(unsigned SrcAS) { - return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; +static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) { + return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS; } -static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { - return castRequiresQueuePtr(ASC->getSrcAddressSpace()); +static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC, + const AMDGPUAS &AS) { + return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS); } -bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { +bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE, + AMDGPUAS AS) { if (CE->getOpcode() == Instruction::AddrSpaceCast) { unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); - return castRequiresQueuePtr(SrcAS); + return castRequiresQueuePtr(SrcAS, AS); } return false; @@ -81,7 +88,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, + AMDGPUAS AS) { if (!ConstantExprVisited.insert(EntryC).second) return false; @@ -94,7 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( // Check this constant expression. if (const auto *CE = dyn_cast<ConstantExpr>(C)) { - if (visitConstantExpr(CE)) + if (visitConstantExpr(CE, AS)) return true; } @@ -115,13 +123,14 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( } // Return true if an addrspacecast is used that requires the queue ptr. 
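// Condensed sketch of the per-function decision the pass now makes (pieces
// taken from the hunks above and below; F, M and TM are the Function, Module
// and optional TargetMachine available in runOnModule):
bool HasApertureRegs =
    TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
// Casting local/private pointers to flat needs the aperture base addresses,
// which come from the queue pointer unless the subtarget exposes them as
// hardware registers.
if (!HasApertureRegs && hasAddrSpaceCast(F, AMDGPU::getAMDGPUAS(M)))
  F.addFnAttr("amdgpu-queue-ptr");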
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) { +bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, + AMDGPUAS AS) { SmallPtrSet<const Constant *, 8> ConstantExprVisited; for (const BasicBlock &BB : F) { for (const Instruction &I : BB) { if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC)) + if (castRequiresQueuePtr(ASC, AS)) return true; } @@ -130,7 +139,7 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) { if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) return true; } } @@ -170,6 +179,7 @@ bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { Triple TT(M.getTargetTriple()); + AS = AMDGPU::getAMDGPUAS(M); static const StringRef IntrinsicToAttr[][2] = { // .x omitted @@ -190,7 +200,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { static const StringRef HSAIntrinsicToAttr[][2] = { { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" } + { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }, + { "llvm.trap", "amdgpu-queue-ptr" }, + { "llvm.debugtrap", "amdgpu-queue-ptr" } }; // TODO: We should not add the attributes if the known compile time workgroup @@ -209,7 +221,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { if (F.hasFnAttribute("amdgpu-queue-ptr")) continue; - if (hasAddrSpaceCast(F)) + bool HasApertureRegs = + TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs(); + if (!HasApertureRegs && hasAddrSpaceCast(F, AS)) F.addFnAttr("amdgpu-queue-ptr"); } } @@ -217,6 +231,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { return Changed; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { - return new AMDGPUAnnotateKernelFeatures(); +ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) { + return new AMDGPUAnnotateKernelFeatures(TM); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index c011be6fa169..91b3649f5c39 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -37,6 +37,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, LoopInfo *LI; DenseMap<Value*, GetElementPtrInst*> noClobberClones; bool isKernelFunc; + AMDGPUAS AMDGPUASI; public: static char ID; @@ -130,8 +131,8 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; - auto isGlobalLoad = [](LoadInst &Load)->bool { - return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + auto isGlobalLoad = [&](LoadInst &Load)->bool { + return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }; // We're tracking up to the Function boundaries // We cannot go beyond because of FunctionPass restrictions @@ -166,6 +167,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { } bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + AMDGPUASI = AMDGPU::getAMDGPUAS(M); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 974e79fff3d7..0446655830d1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -17,11 
+17,11 @@ // #include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "Utils/AMDGPUBaseInfo.h" #include "AMDGPU.h" -#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -93,33 +93,40 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} + : AsmPrinter(TM, std::move(Streamer)) { + AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); + } StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; } +const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { + return TM.getMCSubtargetInfo(); +} + +AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const { + return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer()); +} + void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - // Need to construct an MCSubtargetInfo here in case we have no functions - // in the module. - std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( - TM.getTargetTriple().str(), TM.getTargetCPU(), - TM.getTargetFeatureString())); - - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); - TS->EmitDirectiveHSACodeObjectVersion(2, 1); + getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1); + getTargetStreamer().EmitDirectiveHSACodeObjectISA( + ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); + getTargetStreamer().EmitStartOfCodeObjectMetadata(M); +} - AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; - // Emit runtime metadata. 
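// A hedged sketch of the textual output: with an ISA version of 8.0.1, the
// two directive calls added to EmitStartOfAsmFile above print roughly
//   .hsa_code_object_version 2,1
//   .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
// followed by the code-object metadata emitted by
// EmitStartOfCodeObjectMetadata().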
- TS->EmitRuntimeMetadata(M); + getTargetStreamer().EmitEndOfCodeObjectMetadata(); } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -136,25 +143,32 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); } - void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; + amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { getSIProgramInfo(KernelInfo, *MF); - EmitAmdKernelCodeT(*MF, KernelInfo); + getAmdKernelCode(KernelCode, KernelInfo, *MF); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer().EmitAMDKernelCodeT(KernelCode); } + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(), + KernelCode); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) { - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, MF->getFunction()), - TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); + getTargetStreamer().EmitAMDGPUSymbolType( + SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } AsmPrinter::EmitFunctionEntryLabel(); @@ -163,7 +177,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV)) + if (AMDGPU::isGroupSegment(GV, AMDGPUASI)) return; AsmPrinter::EmitGlobalVariable(GV); @@ -247,6 +261,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)), + false); OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), false); @@ -382,6 +399,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, case AMDGPU::EXEC_HI: case AMDGPU::SCC: case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: continue; case AMDGPU::VCC: @@ -478,33 +499,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ExtraSGPRs = 6; } - // Record first reserved register and reserved register count fields, and - // update max register counts if "amdgpu-debugger-reserve-regs" attribute was - // requested. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; - ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM); - - // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and - // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" - // attribute was requested. 
- if (STM.debuggerEmitPrologue()) { - ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = - RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); - ProgInfo.DebuggerPrivateSegmentBufferSGPR = - RI->getHWRegIndex(MFI->getScratchRSrcReg()); - } + unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { - unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs(); + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", MaxSGPR + 1, DS_Error, - DK_ResourceLimit, MaxAddressableNumSGPRs); + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); MaxSGPR = MaxAddressableNumSGPRs - 1; } @@ -512,41 +520,43 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Account for extra SGPRs and VGPRs reserved for debugger use. MaxSGPR += ExtraSGPRs; - MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM); + MaxVGPR += ExtraVGPRs; // We found the maximum register index. They start at 0, so add one to get the // number of registers. - ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; + ProgInfo.NumVGPR = MaxVGPR + 1; // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. ProgInfo.NumSGPRsForWavesPerEU = std::max( - ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU())); + ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); ProgInfo.NumVGPRsForWavesPerEU = std::max( - ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU())); + ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { - unsigned MaxNumSGPRs = STM.getMaxNumSGPRs(); - if (ProgInfo.NumSGPR > MaxNumSGPRs) { - // This can happen due to a compiler bug or when using inline asm to use the - // registers which are usually reserved for vcc etc. - + unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { + // This can happen due to a compiler bug or when using inline asm to use + // the registers which are usually reserved for vcc etc. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "scalar registers", ProgInfo.NumSGPR, DS_Error, - DK_ResourceLimit, MaxNumSGPRs); + DK_ResourceLimit, + MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - ProgInfo.NumSGPR = MaxNumSGPRs; - ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs; + ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs; } } if (STM.hasSGPRInitBug()) { - ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPR = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + ProgInfo.NumSGPRsForWavesPerEU = + AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; } if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { @@ -565,13 +575,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // SGPRBlocks is actual number of SGPR blocks minus 1. 
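// Worked example for the block encoding computed below (hedged: the encoding
// granule comes from the subtarget and is assumed to be 8 SGPRs here): with
// NumSGPRsForWavesPerEU = 42,
unsigned ExampleSGPRBlocks = alignTo(42, 8) / 8 - 1;  // alignTo(42, 8) == 48, so 5
// i.e. six 8-SGPR blocks are encoded as the value 5, matching the "minus 1"
// convention stated above; VGPRBlocks below follows the same scheme with the
// VGPR encoding granule.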
ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, - RI->getSGPRAllocGranule()); - ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1; + STM.getSGPREncodingGranule()); + ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1; // VGPRBlocks is actual number of VGPR blocks minus 1. ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, - RI->getVGPRAllocGranule()); - ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1; + STM.getVGPREncodingGranule()); + ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; + + // Record first reserved VGPR and number of reserved VGPRs. + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); + + // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and + // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" + // attribute was requested. + if (STM.debuggerEmitPrologue()) { + ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = + RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); + ProgInfo.DebuggerPrivateSegmentBufferSGPR = + RI->getHWRegIndex(MFI->getScratchRSrcReg()); + } // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. @@ -580,7 +604,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.IEEEMode = STM.enableIEEEBit(MF); // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = 1; + ProgInfo.DX10Clamp = STM.enableDX10Clamp(); const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); @@ -635,6 +659,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) | S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | @@ -688,7 +713,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } @@ -713,97 +738,88 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { } } -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { +void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, + const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - amd_kernel_code_t header; - AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); - header.compute_pgm_resource_registers = + Out.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; - + Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; - AMD_HSA_BITS_SET(header.code_properties, + AMD_HSA_BITS_SET(Out.code_properties, 
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize())); if (MFI->hasPrivateSegmentBuffer()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (MFI->hasQueuePtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; if (MFI->hasDispatchID()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; if (MFI->hasFlatScratchInit()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - - // TODO: Private segment size + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; if (MFI->hasGridWorkgroupCountX()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; } if (MFI->hasGridWorkgroupCountY()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; } if (MFI->hasGridWorkgroupCountZ()) { - header.code_properties |= + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; } if (MFI->hasDispatchPtr()) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.debuggerSupported()) - header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; if (STM.isXNACKEnabled()) - header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; + Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; // FIXME: Should use getKernArgSize - header.kernarg_segment_byte_size = + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; - header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + Out.wavefront_sgpr_count = KernelInfo.NumSGPR; + Out.workitem_vgpr_count = KernelInfo.NumVGPR; + Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; + Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. 
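// Worked example for the alignment field computed below (illustrative): a
// maximum kernel-argument alignment of 32 bytes gives
size_t ExampleKernargAlign = std::max((size_t)4, countTrailingZeros(32u));  // == 5
// i.e. the field stores log2 of the alignment (2^5 == 32), clamped to the
// 2^4 == 16 byte minimum mentioned above.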
- header.kernarg_segment_alignment = std::max((size_t)4, + Out.kernarg_segment_alignment = std::max((size_t)4, countTrailingZeros(MFI->getMaxKernArgAlign())); if (STM.debuggerEmitPrologue()) { - header.debug_wavefront_private_segment_offset_sgpr = + Out.debug_wavefront_private_segment_offset_sgpr = KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - header.debug_private_segment_buffer_sgpr = + Out.debug_private_segment_buffer_sgpr = KernelInfo.DebuggerPrivateSegmentBufferSGPR; } - - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - TS->EmitAMDKernelCodeT(header); } bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 9a4bafef3a25..13425c8b2a0f 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -15,95 +15,84 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H -#include "AMDGPUMCInstLower.h" - +#include "AMDKernelCodeT.h" +#include "AMDGPU.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include <cstddef> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> #include <vector> namespace llvm { + +class AMDGPUTargetStreamer; class MCOperand; class AMDGPUAsmPrinter final : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : - VGPRBlocks(0), - SGPRBlocks(0), - Priority(0), - FloatMode(0), - Priv(0), - DX10Clamp(0), - DebugMode(0), - IEEEMode(0), - ScratchSize(0), - ComputePGMRSrc1(0), - LDSBlocks(0), - ScratchBlocks(0), - ComputePGMRSrc2(0), - NumVGPR(0), - NumSGPR(0), - FlatUsed(false), - NumSGPRsForWavesPerEU(0), - NumVGPRsForWavesPerEU(0), - ReservedVGPRFirst(0), - ReservedVGPRCount(0), - DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), - DebuggerPrivateSegmentBufferSGPR((uint16_t)-1), - VCCUsed(false), - CodeLen(0) {} - // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks; - uint32_t SGPRBlocks; - uint32_t Priority; - uint32_t FloatMode; - uint32_t Priv; - uint32_t DX10Clamp; - uint32_t DebugMode; - uint32_t IEEEMode; - uint32_t ScratchSize; - - uint64_t ComputePGMRSrc1; + uint32_t VGPRBlocks = 0; + uint32_t SGPRBlocks = 0; + uint32_t Priority = 0; + uint32_t FloatMode = 0; + uint32_t Priv = 0; + uint32_t DX10Clamp = 0; + uint32_t DebugMode = 0; + uint32_t IEEEMode = 0; + uint32_t ScratchSize = 0; + + uint64_t ComputePGMRSrc1 = 0; // Fields set in PGM_RSRC2 pm4 packet. - uint32_t LDSBlocks; - uint32_t ScratchBlocks; + uint32_t LDSBlocks = 0; + uint32_t ScratchBlocks = 0; - uint64_t ComputePGMRSrc2; + uint64_t ComputePGMRSrc2 = 0; - uint32_t NumVGPR; - uint32_t NumSGPR; + uint32_t NumVGPR = 0; + uint32_t NumSGPR = 0; uint32_t LDSSize; - bool FlatUsed; + bool FlatUsed = false; // Number of SGPRs that meets number of waves per execution unit request. - uint32_t NumSGPRsForWavesPerEU; + uint32_t NumSGPRsForWavesPerEU = 0; // Number of VGPRs that meets number of waves per execution unit request. - uint32_t NumVGPRsForWavesPerEU; + uint32_t NumVGPRsForWavesPerEU = 0; // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first // fixed VGPR number reserved. - uint16_t ReservedVGPRFirst; + uint16_t ReservedVGPRFirst = 0; // The number of consecutive VGPRs reserved. 
- uint16_t ReservedVGPRCount; + uint16_t ReservedVGPRCount = 0; // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or uint16_t(-1) if the register is not used or not known. - uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // execution, or std::numeric_limits<uint16_t>::max() if the register is not + // used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = + std::numeric_limits<uint16_t>::max(); // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or uint16_t(-1) if the register is not used or not - // known. - uint16_t DebuggerPrivateSegmentBufferSGPR; + // kernel execution, or std::numeric_limits<uint16_t>::max() if the register + // is not used or not known. + uint16_t DebuggerPrivateSegmentBufferSGPR = + std::numeric_limits<uint16_t>::max(); // Bonus information for debugging. - bool VCCUsed; - uint64_t CodeLen; + bool VCCUsed = false; + uint64_t CodeLen = 0; + + SIProgramInfo() = default; }; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, + const MachineFunction &MF) const; void findNumUsedRegistersSI(const MachineFunction &MF, unsigned &NumSGPR, unsigned &NumVGPR) const; @@ -112,21 +101,28 @@ private: /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override; + const MCSubtargetInfo* getSTI() const; + + AMDGPUTargetStreamer& getTargetStreamer() const; + + bool runOnMachineFunction(MachineFunction &MF) override; + /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated /// pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + /// \brief Lower the specified LLVM Constant to an MCExpr. + /// The AsmPrinter::lowerConstantof does not know how to lower + /// addrspacecast, therefore they should be lowered by this function. + const MCExpr *lowerConstant(const Constant *CV) override; + /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo /// instructions. 
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, @@ -143,6 +139,8 @@ public: void EmitStartOfAsmFile(Module &M) override; + void EmitEndOfAsmFile(Module &M) override; + bool isBlockOnlyReachableByFallthrough( const MachineBasicBlock *MBB) const override; @@ -153,8 +151,9 @@ public: protected: std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; + AMDGPUAS AMDGPUASI; }; -} // End anonymous llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index d53cc153dc9a..e67ae092fdda 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -14,8 +14,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUCallLowering.h" +#include "AMDGPU.h" #include "AMDGPUISelLowering.h" - +#include "AMDGPUSubtarget.h" +#include "SIISelLowering.h" +#include "SIRegisterInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -26,17 +31,138 @@ using namespace llvm; #endif AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) - : CallLowering(&TLI) { + : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) { } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg) const { + const Value *Val, unsigned VReg) const { + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); return true; } +unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, + Type *ParamTy, + unsigned Offset) const { + + MachineFunction &MF = MIRBuilder.getMF(); + const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + LLT PtrType = getLLTForType(*PtrTy, DL); + unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); + unsigned KernArgSegmentPtr = + TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); + + unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MIRBuilder.buildConstant(OffsetReg, Offset); + + MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg); + + return DstReg; +} + +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, + Type *ParamTy, unsigned Offset, + unsigned DstReg) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + unsigned TypeSize = DL.getTypeStoreSize(ParamTy); + unsigned Align = DL.getABITypeAlignment(ParamTy); + unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | + MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant, + TypeSize, Align); + + MIRBuilder.buildLoad(DstReg, PtrReg, *MMO); +} + bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { - // TODO: Implement once there are 
generic loads/stores. + + MachineFunction &MF = MIRBuilder.getMF(); + const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(QueuePtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + const LLT P2 = LLT::pointer(2, 64); + unsigned VReg = MRI.createGenericVirtualRegister(P2); + MRI.addLiveIn(InputPtrReg, VReg); + MIRBuilder.getMBB().addLiveIn(InputPtrReg); + MIRBuilder.buildCopy(VReg, InputPtrReg); + CCInfo.AllocateReg(InputPtrReg); + } + + if (Info->hasDispatchID()) { + unsigned DispatchIDReg = Info->addDispatchID(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(DispatchIDReg); + } + + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + // FIXME: Need to add reg as live-in + CCInfo.AllocateReg(FlatScratchInitReg); + } + + unsigned NumArgs = F.arg_size(); + Function::const_arg_iterator CurOrigArg = F.arg_begin(); + const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT(); + ISD::ArgFlagsTy Flags; + Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), + /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + + Function::const_arg_iterator Arg = F.arg_begin(); + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { + // FIXME: We should be getting DebugInfo from the arguments some how. 
+ CCValAssign &VA = ArgLocs[i]; + lowerParameter(MIRBuilder, Arg->getType(), + VA.getLocMemOffset() + + Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]); + } + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 9ae87c9397ab..09bdf8ffcde7 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H +#include "AMDGPU.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" namespace llvm { @@ -22,6 +23,14 @@ namespace llvm { class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { + AMDGPUAS AMDGPUASI; + + unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset) const; + + void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, + unsigned Offset, unsigned DstReg) const; + public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); @@ -29,6 +38,7 @@ class AMDGPUCallLowering: public CallLowering { unsigned VReg) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; }; } // End of namespace llvm; #endif diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 47dfa4992068..d308f718aae1 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -17,7 +17,7 @@ class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {} // Calling convention for SI def CC_SI : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -25,17 +25,13 @@ def CC_SI : CallingConv<[ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, - CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, - SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, - SGPR32, SGPR34, SGPR36, SGPR38 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, - SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, - SGPR33, SGPR35, SGPR37, SGPR39 ] - >>>, + // We have no way of referring to the generated register tuples + // here, so use a custom function. + CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, + CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
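The CCCustom hook referenced above exists because the old even/odd shadow lists cannot name the generated SGPR tuples; conceptually it hands out one even-aligned 64-bit pair and reserves both 32-bit halves so later scalar arguments cannot reuse them. A toy allocator showing that behaviour (an illustration only, not the CCState API):

#include <array>
#include <cstdio>

struct SGPRPool {
  std::array<bool, 40> Used{};            // SGPR0..SGPR39

  // Return the index of the first free even-aligned pair, or -1 if none.
  int allocatePair() {
    for (int R = 0; R + 1 < 40; R += 2) {
      if (!Used[R] && !Used[R + 1]) {
        Used[R] = Used[R + 1] = true;     // reserve both 32-bit halves
        return R;
      }
    }
    return -1;
  }
};

int main() {
  SGPRPool Pool;
  Pool.Used[0] = true;                    // pretend SGPR0 already holds an i32
  int Pair = Pool.allocatePair();         // skips SGPR0_SGPR1, takes SGPR2_SGPR3
  std::printf("SGPR%d_SGPR%d\n", Pair, Pair + 1);
}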
- CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -53,17 +49,7 @@ def CC_SI : CallingConv<[ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 - ]>>>, - - CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, - SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, - SGPR32, SGPR34, SGPR36, SGPR38 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, - SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, - SGPR33, SGPR35, SGPR37, SGPR39 ] - >>> - + ]>>> ]>; def RetCC_SI : CallingConv<[ @@ -76,7 +62,7 @@ def RetCC_SI : CallingConv<[ ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. - CCIfType<[f32] , CCAssignToReg<[ + CCIfType<[f32, f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index e6230547a9b3..e19314fe0a6c 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,16 +14,31 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" - +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include <cassert> +#include <iterator> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -34,17 +49,15 @@ namespace { class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { const GCNTargetMachine *TM; - const SISubtarget *ST; - DivergenceAnalysis *DA; - Module *Mod; - bool HasUnsafeFPMath; + const SISubtarget *ST = nullptr; + DivergenceAnalysis *DA = nullptr; + Module *Mod = nullptr; + bool HasUnsafeFPMath = false; /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. /// /// \returns Binary operation \p V. - Value *copyFlags(const BinaryOperator &I, Value *V) const; - /// \returns \p T's base element bit width. 
unsigned getBaseElementBitWidth(const Type *T) const; @@ -113,13 +126,9 @@ class AMDGPUCodeGenPrepare : public FunctionPass, public: static char ID; + AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : - FunctionPass(ID), - TM(static_cast<const GCNTargetMachine *>(TM)), - ST(nullptr), - DA(nullptr), - Mod(nullptr), - HasUnsafeFPMath(false) { } + FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {} bool visitFDiv(BinaryOperator &I); @@ -142,22 +151,7 @@ public: } }; -} // End anonymous namespace - -Value *AMDGPUCodeGenPrepare::copyFlags( - const BinaryOperator &I, Value *V) const { - BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V); - if (!BinOp) // Possibly constant expression. - return V; - - if (isa<OverflowingBinaryOperator>(BinOp)) { - BinOp->setHasNoSignedWrap(I.hasNoSignedWrap()); - BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); - } else if (isa<PossiblyExactOperator>(BinOp)) - BinOp->setIsExact(I.isExact()); - - return V; -} +} // end anonymous namespace unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { assert(needsPromotionToI32(T) && "T does not need promotion to i32"); @@ -186,12 +180,48 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { } bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { - if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 && - T->getIntegerBitWidth() <= 16) + const IntegerType *IntTy = dyn_cast<IntegerType>(T); + if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) + return true; + + if (const VectorType *VT = dyn_cast<VectorType>(T)) { + // TODO: The set of packed operations is more limited, so may want to + // promote some anyway. + if (ST->hasVOP3PInsts()) + return false; + + return needsPromotionToI32(VT->getElementType()); + } + + return false; +} + +// Return true if the op promoted to i32 should have nsw set. +static bool promotedOpIsNSW(const Instruction &I) { + switch (I.getOpcode()) { + case Instruction::Shl: + case Instruction::Add: + case Instruction::Sub: + return true; + case Instruction::Mul: + return I.hasNoUnsignedWrap(); + default: + return false; + } +} + +// Return true if the op promoted to i32 should have nuw set. 
+static bool promotedOpIsNUW(const Instruction &I) { + switch (I.getOpcode()) { + case Instruction::Shl: + case Instruction::Add: + case Instruction::Mul: return true; - if (!T->isVectorTy()) + case Instruction::Sub: + return I.hasNoUnsignedWrap(); + default: return false; - return needsPromotionToI32(cast<VectorType>(T)->getElementType()); + } } bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { @@ -218,7 +248,19 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); } - ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1)); + + ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); + if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { + if (promotedOpIsNSW(cast<Instruction>(I))) + Inst->setHasNoSignedWrap(); + + if (promotedOpIsNUW(cast<Instruction>(I))) + Inst->setHasNoUnsignedWrap(); + + if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) + Inst->setIsExact(ExactOp->isExact()); + } + TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); I.replaceAllUsesWith(TruncRes); @@ -346,9 +388,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Builder.setFastMathFlags(FMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); - Function *Decl - = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 805fb7102a35..e32ca9653b3a 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -12,11 +12,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPUFrameLowering.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Support/MathExtras.h" using namespace llvm; AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, @@ -69,34 +64,3 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { // T1.W = stack[1].w return 1; } - -/// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - const AMDGPURegisterInfo *RI - = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo(); - - // Fill in FrameReg output argument. - FrameReg = RI->getFrameRegister(MF); - - // Start the offset at 2 so we don't overwrite work group information. - // XXX: We should only do this when the shader actually uses this - // information. - unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? MFI.getNumObjects() : FI; - - for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); - OffsetBytes += MFI.getObjectSize(i); - // Each register holds 4 bytes, so we must always align the offset to at - // least 4 bytes, so that 2 frame objects won't share the same register. 
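promotedOpIsNSW/promotedOpIsNUW above are justified by the value ranges involved: after sign or zero extension an i16 operand is far from the i32 limits, so a widened add/sub/shl cannot wrap, while a widened multiply can exceed INT32_MAX but never UINT32_MAX. A quick standalone check of those bounds (independent of the pass itself):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t SMin = INT16_MIN, SMax = INT16_MAX;   // sign-extended extremes
  const int32_t UMax = UINT16_MAX;                    // zero-extended extreme

  // add: never wraps i32 for sign- or zero-extended i16 inputs.
  assert((int64_t)SMin + SMin >= INT32_MIN && (int64_t)SMax + SMax <= INT32_MAX);
  assert((int64_t)UMax + UMax <= (int64_t)UINT32_MAX);

  // mul of zero-extended i16 values stays below 2^32, so nuw is safe, but it
  // can exceed INT32_MAX, which is why nsw is only kept if the source had nuw.
  assert((uint64_t)UMax * UMax <= UINT32_MAX);
  assert((int64_t)UMax * UMax > INT32_MAX);
  return 0;
}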
- OffsetBytes = alignTo(OffsetBytes, 4); - } - - if (FI != -1) - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); - - return OffsetBytes / (getStackWidth(MF) * 4); -} diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 5d51351a00d2..8e187c7e56c1 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -34,9 +34,6 @@ public: /// values to the stack. unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override { return false; } diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def new file mode 100644 index 000000000000..5cb9036f4823 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -0,0 +1,62 @@ +//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines all the static objects used by AMDGPURegisterBankInfo. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +namespace llvm { +namespace AMDGPU { + +enum PartialMappingIdx { + None = - 1, + PM_SGPR32 = 0, + PM_SGPR64 = 1, + PM_VGPR32 = 2, + PM_VGPR64 = 3 +}; + +const RegisterBankInfo::PartialMapping PartMappings[] { + // StartIdx, Length, RegBank + {0, 32, SGPRRegBank}, + {0, 64, SGPRRegBank}, + {0, 32, VGPRRegBank}, + {0, 64, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappings[] { + // SGPR 32-bit + {&PartMappings[0], 1}, + // SGPR 64-bit + {&PartMappings[1], 1}, + // VGPR 32-bit + {&PartMappings[2], 1}, + // VGPR 64-bit + {&PartMappings[3], 1} +}; + +enum ValueMappingIdx { + SGPRStartIdx = 0, + VGPRStartIdx = 2 +}; + +const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, + unsigned Size) { + assert(Size % 32 == 0); + unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx; + Idx += (Size / 32) - 1; + return &ValMappings[Idx]; +} + +} // End AMDGPU namespace. +} // End llvm namespace. diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5bf347e48650..318de7f2e3d2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -67,10 +67,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. 
const AMDGPUSubtarget *Subtarget; + AMDGPUAS AMDGPUASI; public: explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel) {} + : SelectionDAGISel(TM, OptLevel){ + AMDGPUASI = AMDGPU::getAMDGPUAS(TM); + } ~AMDGPUDAGToDAGISel() override = default; bool runOnMachineFunction(MachineFunction &MF) override; @@ -80,6 +83,7 @@ public: private: SDValue foldFrameIndex(SDValue N) const; + bool isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -143,6 +147,8 @@ private: bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; + + bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -156,7 +162,15 @@ private: SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3OMods(SDValue In, SDValue &Src, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + void SelectADD_SUB_I64(SDNode *N); + void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -187,6 +201,17 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { + if (TM.Options.NoNaNsFPMath) + return true; + + // TODO: Move into isKnownNeverNaN + if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N)) + return BO->Flags.hasNoNaNs(); + + return CurDAG->isKnownNeverNaN(N); +} + bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { const SIInstrInfo *TII = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo(); @@ -250,7 +275,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS) return N; const SITargetLowering& Lowering = @@ -290,6 +315,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { llvm_unreachable("invalid vector size"); } +static bool getConstantValue(SDValue N, uint32_t &Out) { + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + Out = C->getAPIntValue().getZExtValue(); + return true; + } + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { + Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); + return true; + } + + return false; +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -319,6 +358,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } + case ISD::UADDO: + case ISD::USUBO: { + SelectUADDO_USUBO(N); + return; + } case AMDGPUISD::FMUL_W_CHAIN: { SelectFMUL_W_CHAIN(N); return; @@ -336,7 +380,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT VT = N->getValueType(0); 
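getConstantValue above only needs the raw 32-bit pattern of a constant operand, whether it started life as an integer or a floating-point node. Outside SelectionDAG that is just a bit-cast; a minimal sketch, with std::memcpy standing in for bitcastToAPInt:

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t rawBits(float F) {
  uint32_t Out;
  std::memcpy(&Out, &F, sizeof(Out));    // well-defined bit-cast
  return Out;
}

int main() {
  std::printf("%08x\n", rawBits(1.0f));  // 3f800000
  std::printf("%08x\n", rawBits(-0.0f)); // 80000000
}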
unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); + + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + if (Opc == ISD::BUILD_VECTOR) { + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + uint32_t K = LHSVal | (RHSVal << 16); + CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, + CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); + return; + } + } + + break; + } + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { @@ -502,7 +563,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); - Lowering.legalizeTargetIndependentNode(N, *CurDAG); + N = Lowering.legalizeTargetIndependentNode(N, *CurDAG); break; } case ISD::AND: @@ -531,9 +592,9 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { if (!N->readMem()) return false; if (CbId == -1) - return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; - return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; } bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { @@ -689,6 +750,17 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { CurDAG->RemoveDeadNode(N); } +void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { + // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned + // carry out despite the _i32 name. These were renamed in VI to _U32. + // FIXME: We should probably rename the opcodes here. + unsigned Opc = N->getOpcode() == ISD::UADDO ? + AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), + { N->getOperand(0), N->getOperand(1) }); +} + void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { SDLoc SL(N); // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod @@ -1176,16 +1248,6 @@ bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr, return true; } -/// -/// \param EncodedOffset This is the immediate value that will be encoded -/// directly into the instruction. On SI/CI the \p EncodedOffset -/// will be in units of dwords and on VI+ it will be units of bytes. -static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, - int64_t EncodedOffset) { - return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? - isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); -} - bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const { @@ -1197,10 +1259,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDLoc SL(ByteOffsetNode); AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); int64_t ByteOffset = C->getSExtValue(); - int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? 
- ByteOffset >> 2 : ByteOffset; + int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); - if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) { Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); Imm = true; return true; @@ -1481,7 +1542,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MemSDNode *Mem = cast<MemSDNode>(N); unsigned AS = Mem->getAddressSpace(); - if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (AS == AMDGPUASI.FLAT_ADDRESS) { SelectCode(N); return; } @@ -1545,7 +1606,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1559,10 +1619,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + SelectVOP3Mods(In, Src, SrcMods); + return isNoNanSrc(Src); +} + bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { bool Res = SelectVOP3Mods(In, Src, SrcMods); @@ -1607,6 +1672,50 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, + SDValue &Clamp, SDValue &Omod) const { + Src = In; + + SDLoc DL(In); + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + Src = In; + + // FIXME: Look for on separate components + if (Src.getOpcode() == ISD::FNEG) { + Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = Src.getOperand(0); + } + + // Packed instructions do not have abs modifiers. + + // FIXME: Handle abs/neg of individual components. 
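The UADDO/USUBO selection above leans on v_add_i32/v_sub_i32 producing both a 32-bit result and an unsigned carry-out. The node semantics being matched are simply the following (a sketch of the semantics, not of the selector):

#include <cstdint>
#include <cstdio>

static uint32_t uaddo(uint32_t A, uint32_t B, bool &Carry) {
  uint32_t Sum = A + B;       // wraps modulo 2^32
  Carry = Sum < A;            // carry-out of the unsigned add
  return Sum;
}

int main() {
  bool C;
  uint32_t S = uaddo(0xffffffffu, 2u, C);
  std::printf("sum=%u carry=%d\n", S, C);   // sum=1 carry=1
}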
+ // FIXME: Handle swizzling with op_sel + Mods |= SISrcMods::OP_SEL_1; + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp and op_sel + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3PMods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 54caa2c5dfad..c0f336e082bd 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -43,6 +44,37 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, return true; } +static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, + const TargetRegisterClass *RC, + unsigned NumRegs) { + ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs); + unsigned RegResult = State.AllocateReg(RegList); + if (RegResult == AMDGPU::NoRegister) + return false; + + State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo)); + return true; +} + +static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + switch (LocVT.SimpleTy) { + case MVT::i64: + case MVT::f64: + case MVT::v2i32: + case MVT::v2f32: { + // Up to SGPR0-SGPR39 + return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + &AMDGPU::SGPR_64RegClass, 20); + } + default: + return false; + } +} + #include "AMDGPUGenCallingConv.inc" // Find a larger type to do a load / store of a vector with. @@ -58,6 +90,7 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + AMDGPUASI = AMDGPU::getAMDGPUAS(TM); // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::LOAD, MVT::f32, Promote); @@ -211,10 +244,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // This is totally unsupported, just custom lower to produce an error. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - // Library functions. These default to Expand, but we have instructions // for them. setOperationAction(ISD::FCEIL, MVT::f32, Legal); @@ -270,6 +299,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { @@ -460,10 +490,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // N > 4 stores on the same chain. 
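SelectVOP3Mods and SelectVOP3PMods above peel fneg (and, for non-packed operands, fabs) off the source and record them as modifier bits on the instruction; packed operands set NEG and NEG_HI together and have no abs modifier. A toy model of that fold, using made-up bit values rather than the real SISrcMods encoding:

#include <cstdio>

enum SrcMods { NEG = 1, ABS = 2 };   // illustrative bit values

struct Expr {
  enum Kind { Value, FNeg, FAbs } K;
  const Expr *Src;                   // operand of FNeg/FAbs, null for Value
};

// Peel fneg, then fabs, off the operand and accumulate modifier bits, the way
// the selector does before emitting the VOP3 form.
static const Expr *foldMods(const Expr *E, unsigned &Mods) {
  if (E->K == Expr::FNeg) { Mods |= NEG; E = E->Src; }
  if (E->K == Expr::FAbs) { Mods |= ABS; E = E->Src; }
  return E;
}

int main() {
  Expr V{Expr::Value, nullptr}, A{Expr::FAbs, &V}, N{Expr::FNeg, &A};
  unsigned Mods = 0;
  foldMods(&N, Mods);                // -|x|  ->  x with NEG|ABS set
  std::printf("mods=%u\n", Mods);    // prints mods=3
}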
GatherAllAliasesMaxDepth = 16; - // FIXME: Need to really handle these. - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; + // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry + // about these during lowering. + MaxStoresPerMemcpy = 0xffffffff; + MaxStoresPerMemmove = 0xffffffff; + MaxStoresPerMemset = 0xffffffff; setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::SHL); @@ -478,12 +509,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); + setTargetDAGCombine(ISD::FABS); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +LLVM_READNONE static bool fnegFoldsIntoOp(unsigned Opc) { switch (Opc) { case ISD::FADD: @@ -491,17 +524,77 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FMUL: case ISD::FMA: case ISD::FMAD: + case ISD::FMINNUM: + case ISD::FMAXNUM: case ISD::FSIN: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::SIN_HW: case AMDGPUISD::FMUL_LEGACY: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: return true; default: return false; } } +/// \p returns true if the operation will definitely need to use a 64-bit +/// encoding, and thus will use a VOP3 encoding regardless of the source +/// modifiers. +LLVM_READONLY +static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { + return N->getNumOperands() > 2 || VT == MVT::f64; +} + +// Most FP instructions support source modifiers, but this could be refined +// slightly. +LLVM_READONLY +static bool hasSourceMods(const SDNode *N) { + if (isa<MemSDNode>(N)) + return false; + + switch (N->getOpcode()) { + case ISD::CopyToReg: + case ISD::SELECT: + case ISD::FDIV: + case ISD::FREM: + case ISD::INLINEASM: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + case AMDGPUISD::DIV_SCALE: + return false; + default: + return true; + } +} + +static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) { + // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus + // it is truly free to use a source modifier in all cases. If there are + // multiple users but for each one will necessitate using VOP3, there will be + // a code size increase. Try to avoid increasing code size unless we know it + // will save on the instruction count. + unsigned NumMayIncreaseSize = 0; + MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); + + // XXX - Should this limit number of uses to check? + for (const SDNode *U : N->uses()) { + if (!hasSourceMods(U)) + return false; + + if (!opMustUseVOP3Encoding(U, VT)) { + if (++NumMayIncreaseSize > CostThreshold) + return false; + } + } + + return true; +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -580,12 +673,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && - VT == MVT::f16); + + // Packed operations do not have a fabs modifier. 
+ return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - return isFAbsFree(VT); + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16) || + (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -667,6 +765,11 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // TargetLowering Callbacks //===---------------------------------------------------------------------===// +CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + return CC_AMDGPU; +} + /// The SelectionDAGBuilder will automatically promote function arguments /// with illegal types. However, this does not work for the AMDGPU targets /// since the function arguments are stored in memory as these illegal types. @@ -764,11 +867,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, } } -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const { - State.AnalyzeFormalArguments(Ins, CC_AMDGPU); -} - void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, const SmallVectorImpl<ISD::OutputArg> &Outs) const { @@ -788,6 +886,24 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Target specific lowering //===---------------------------------------------------------------------===// +/// Selects the correct CCAssignFn for a given CallingConvention value. +CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) { + switch (CC) { + case CallingConv::C: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return CC_AMDGPU; + default: + report_fatal_error("Unsupported calling convention."); + } +} + SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SDValue Callee = CLI.Callee; @@ -829,14 +945,13 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: - Op->dump(&DAG); + Op->print(errs(), &DAG); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); case ISD::FREM: return LowerFREM(Op, DAG); @@ -892,19 +1007,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); - switch (G->getAddressSpace()) { - case AMDGPUAS::LOCAL_ADDRESS: { + if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); // TODO: We could emit code to handle the initialization somewhere. 
- if (hasDefinedInitializer(GV)) - break; - - unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); - return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); - } + if (!hasDefinedInitializer(GV)) { + unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); + return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); + } } const Function &Fn = *DAG.getMachineFunction().getFunction(); @@ -936,41 +1048,12 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } -SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - } -} - /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, +SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return SDValue(); - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); @@ -1228,7 +1311,10 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); + unsigned OpCode = Subtarget->hasFP32Denormals() ? + (unsigned)AMDGPUISD::FMAD_FTZ : + (unsigned)ISD::FMAD; + SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); @@ -1662,32 +1748,37 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con } // XXX - May require not supporting f32 denormals? -SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + +// Don't handle v2f16. The extra instructions to scalarize and repack around the +// compare and vselect end up producing worse code than scalarizing the whole +// operation. +SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue X = Op.getOperand(0); + EVT VT = Op.getValueType(); - SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); // TODO: Should this propagate fast-math-flags? 
- SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); - SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + const SDValue One = DAG.getConstantFP(1.0, SL, VT); + const SDValue Half = DAG.getConstantFP(0.5, SL, VT); - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); - SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); + return DAG.getNode(ISD::FADD, SL, VT, T, Sel); } SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { @@ -1750,8 +1841,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT == MVT::f32) - return LowerFROUND32(Op, DAG); + if (VT == MVT::f32 || VT == MVT::f16) + return LowerFROUND32_16(Op, DAG); if (VT == MVT::f64) return LowerFROUND64(Op, DAG); @@ -2030,15 +2121,19 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, } SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue N0 = Op.getOperand(0); + + // Convert to target node to get known bits + if (N0.getValueType() == MVT::f32) + return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); if (getTargetMachine().Options.UnsafeFPMath) { // There is a generic expand for FP_TO_FP16 with unsafe fast math. return SDValue(); } - SDLoc DL(Op); - SDValue N0 = Op.getOperand(0); - assert (N0.getSimpleValueType() == MVT::f64); + assert(N0.getSimpleValueType() == MVT::f64); // f64 -> f16 conversion using round-to-nearest-even rounding mode. const unsigned ExpMask = 0x7ff; @@ -2379,6 +2474,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } +SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. 
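performClampCombine above simply evaluates the clamp output modifier on a constant: anything below 0, and NaN when DX10 clamp is enabled, folds to 0.0; anything above 1 folds to 1.0; everything else is left untouched. A minimal sketch of that rule:

#include <cmath>
#include <cstdio>

static float foldClamp(float C, bool DX10Clamp) {
  if (C < 0.0f || (std::isnan(C) && DX10Clamp))
    return 0.0f;
  if (C > 1.0f)
    return 1.0f;
  return C;                 // already in [0, 1], or NaN without dx10 clamp
}

int main() {
  std::printf("%g %g %g\n", foldClamp(-2.5f, true), foldClamp(3.0f, true),
              foldClamp(NAN, true));   // 0 1 0
}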
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -2821,20 +2938,41 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); } - } - if (VT == MVT::f32 && Cond.hasOneUse()) { - SDValue MinMax - = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); - // Revisit this node so we can catch min3/max3/med3 patterns. - //DCI.AddToWorklist(MinMax.getNode()); - return MinMax; + if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { + SDValue MinMax + = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + // Revisit this node so we can catch min3/max3/med3 patterns. + //DCI.AddToWorklist(MinMax.getNode()); + return MinMax; + } } // There's no reason to not do this if the condition has other uses. return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +static bool isConstantFPZero(SDValue N) { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) + return C->isZero() && !C->isNegative(); + return false; +} + +static unsigned inverseMinMax(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return ISD::FMINNUM; + case ISD::FMINNUM: + return ISD::FMAXNUM; + case AMDGPUISD::FMAX_LEGACY: + return AMDGPUISD::FMIN_LEGACY; + case AMDGPUISD::FMIN_LEGACY: + return AMDGPUISD::FMAX_LEGACY; + default: + llvm_unreachable("invalid min/max opcode"); + } +} + SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2847,10 +2985,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, // the other uses cannot, give up. This both prevents unprofitable // transformations and infinite loops: we won't repeatedly try to fold around // a negate that has no 'good' form. - // - // TODO: Check users can fold - if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) - return SDValue(); + if (N0.hasOneUse()) { + // This may be able to fold into the source, but at a code size cost. Don't + // fold if the fold into the user is free. 
+ if (allUsesHaveSourceMods(N, 0)) + return SDValue(); + } else { + if (fnegFoldsIntoOp(Opc) && + (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) + return SDValue(); + } SDLoc SL(N); switch (Opc) { @@ -2872,7 +3016,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, else RHS = RHS.getOperand(0); - SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -2891,7 +3035,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, else RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); if (!N0.hasOneUse()) DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; @@ -2923,10 +3067,40 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; } + case ISD::FMAXNUM: + case ISD::FMINNUM: + case AMDGPUISD::FMAX_LEGACY: + case AMDGPUISD::FMIN_LEGACY: { + // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) + // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) + // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) + // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) + + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + // 0 doesn't have a negated inline immediate. + // TODO: Shouldn't fold 1/2pi either, and should be generalized to other + // operations. + if (isConstantFPZero(RHS)) + return SDValue(); + + SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + unsigned Opposite = inverseMinMax(Opc); + + SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } case ISD::FP_EXTEND: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: // XXX - Should fround be handled? + case ISD::FSIN: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: - case ISD::FSIN: case AMDGPUISD::SIN_HW: { SDValue CvtSrc = N0.getOperand(0); if (CvtSrc.getOpcode() == ISD::FNEG) { @@ -2941,7 +3115,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, // (fneg (fp_extend x)) -> (fp_extend (fneg x)) // (fneg (rcp x)) -> (rcp (fneg x)) SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); - return DAG.getNode(Opc, SL, VT, Neg); + return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); } case ISD::FP_ROUND: { SDValue CvtSrc = N0.getOperand(0); @@ -2959,6 +3133,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); } + case ISD::FP16_TO_FP: { + // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal + // f16, but legalization of f16 fneg ends up pulling it out of the source. + // Put the fneg back as a legal source operation that can be matched later. 
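The min/max cases above rely on the identity -max(x, y) == min(-x, -y) (and vice versa), which is what lets the negation migrate onto the operands as source modifiers. NaN handling aside, a quick standalone check of the identity:

#include <algorithm>
#include <cassert>

int main() {
  const float Vals[] = {-3.5f, -0.25f, 0.5f, 2.0f, 7.0f};
  for (float X : Vals)
    for (float Y : Vals) {
      assert(-std::max(X, Y) == std::min(-X, -Y));
      assert(-std::min(X, Y) == std::max(-X, -Y));
    }
  return 0;
}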
+ SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) + SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, + DAG.getConstant(0x8000, SL, SrcVT)); + return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + if (!N0.hasOneUse()) + return SDValue(); + + switch (N0.getOpcode()) { + case ISD::FP16_TO_FP: { + assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); + SDLoc SL(N); + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + + // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) + SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, + DAG.getConstant(0x7fff, SL, SrcVT)); + return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); + } default: return SDValue(); } @@ -3071,6 +3284,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSelectCombine(N, DCI); case ISD::FNEG: return performFNegCombine(N, DCI); + case ISD::FABS: + return performFAbsCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -3159,6 +3374,18 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); + case AMDGPUISD::RCP: { + if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { + // XXX - Should this flush denormals? + const APFloat &Val = CFP->getValueAPF(); + APFloat One(Val.getSemantics(), "1.0"); + return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); + } + + break; + } } return SDValue(); } @@ -3201,13 +3428,17 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AMDGPUISD::NodeType)Opcode) { case AMDGPUISD::FIRST_NUMBER: break; // AMDIL DAG nodes - NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes + NODE_NAME_CASE(IF) + NODE_NAME_CASE(ELSE) + NODE_NAME_CASE(LOOP) + NODE_NAME_CASE(CALL) + NODE_NAME_CASE(RET_FLAG) + NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) - NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) @@ -3232,6 +3463,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(FMAD_FTZ) NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) @@ -3265,7 +3497,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_INPUT) NODE_NAME_CASE(SAMPLE) NODE_NAME_CASE(SAMPLEB) NODE_NAME_CASE(SAMPLED) @@ -3274,6 +3505,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE1) NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(FP_TO_FP16) + NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) @@ -3338,13 +3572,11 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, } void 
AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { + const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + unsigned BitWidth = KnownZero.getBitWidth(); + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. APInt KnownZero2; APInt KnownOne2; @@ -3365,21 +3597,27 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( if (!CWidth) return; - unsigned BitWidth = 32; uint32_t Width = CWidth->getZExtValue() & 0x1f; if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + KnownZero = APInt::getHighBitsSet(32, 32 - Width); break; } + case AMDGPUISD::FP_TO_FP16: + case AMDGPUISD::FP16_ZEXT: { + unsigned BitWidth = KnownZero.getBitWidth(); + + // High bits are zero. + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); + break; + } } } unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth) const { + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { switch (Op.getOpcode()) { case AMDGPUISD::BFE_I32: { ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); @@ -3403,7 +3641,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: return 31; - + case AMDGPUISD::FP_TO_FP16: + case AMDGPUISD::FP16_ZEXT: + return 16; default: return 1; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index f6adceac6f11..d6aa0ba92bf7 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -16,6 +16,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H +#include "AMDGPU.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/Target/TargetLowering.h" namespace llvm { @@ -34,10 +36,10 @@ private: protected: const AMDGPUSubtarget *Subtarget; + AMDGPUAS AMDGPUASI; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. 
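Both fp16 combines above operate purely on the integer bit pattern of the half value: bit 15 is the sign bit, so fneg is xor 0x8000 and fabs is and 0x7fff. A minimal sketch on raw half encodings (0x3c00 is 1.0 in IEEE half):

#include <cstdint>
#include <cstdio>

static uint16_t halfNeg(uint16_t H) { return H ^ 0x8000; }  // flip sign bit
static uint16_t halfAbs(uint16_t H) { return H & 0x7fff; }  // clear sign bit

int main() {
  uint16_t One = 0x3c00;                     // +1.0
  std::printf("%04x %04x\n", halfNeg(One),   // bc00 == -1.0
              halfAbs(halfNeg(One)));        // 3c00 == +1.0
}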
@@ -47,7 +49,7 @@ protected: SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; @@ -70,6 +72,7 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -85,6 +88,7 @@ protected: SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -111,8 +115,6 @@ protected: SmallVectorImpl<SDValue> &Results) const; void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; - void AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const; void AnalyzeReturn(CCState &State, const SmallVectorImpl<ISD::OutputArg> &Outs) const; @@ -120,7 +122,7 @@ public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); bool mayIgnoreSignedZero(SDValue Op) const { - if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only + if (getTargetMachine().Options.NoSignedZerosFPMath) return true; if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op)) @@ -158,6 +160,7 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, @@ -174,7 +177,7 @@ public: SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; - SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, + SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; @@ -198,10 +201,12 @@ public: void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth = 0) const override; /// \brief Helper function that adds Reg to the LiveIn list of the DAG's @@ -222,6 +227,10 @@ public: /// type of implicit parameter. 
uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; + + AMDGPUAS getAMDGPUAS() const { + return AMDGPUASI; + } }; namespace AMDGPUISD { @@ -229,15 +238,34 @@ namespace AMDGPUISD { enum NodeType : unsigned { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication BRANCH_COND, // End AMDIL ISD Opcodes + + // Function call. + CALL, + + // Masked control flow nodes. + IF, + ELSE, + LOOP, + + // A uniform kernel return that terminates the wavefront. ENDPGM, - RETURN, + + // Return to a shader part's epilog code. + RETURN_TO_EPILOG, + + // Return with values from a non-entry function. + RET_FLAG, + DWORDADDR, FRACT, + + /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output + /// modifier behavior with dx10_enable. CLAMP, + // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. SETCC, @@ -265,6 +293,9 @@ enum NodeType : unsigned { DIV_SCALE, DIV_FMAS, DIV_FIXUP, + // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is + // treated as an illegal operation. + FMAD_FTZ, TRIG_PREOP, // 1 ULP max error for f64 // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. @@ -301,7 +332,6 @@ enum NodeType : unsigned { CONST_ADDRESS, REGISTER_LOAD, REGISTER_STORE, - LOAD_INPUT, SAMPLE, SAMPLEB, SAMPLED, @@ -312,6 +342,18 @@ enum NodeType : unsigned { CVT_F32_UBYTE1, CVT_F32_UBYTE2, CVT_F32_UBYTE3, + + // Convert two float 32 numbers into a single register holding two packed f16 + // with round to zero. + CVT_PKRTZ_F16_F32, + + // Same as the standard node, except the high bits of the resulting integer + // are known 0. + FP_TO_FP16, + + // Wrapper around fp16 results that are known to zero the high bits. + FP16_ZEXT, + /// This node is for VLIW targets and it is used to represent a vector /// that is stored in consecutive registers with the same channel. /// For example: diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index e4dc6599e156..a01f5d37c7c1 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -30,7 +30,7 @@ using namespace llvm; void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(-1, -1), ST(ST) {} + : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {} // FIXME: This behaves strangely. If, for example, you have 32 load + stores, // the first 16 loads will be interleaved with the stores, and the next 16 will @@ -86,6 +86,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { case AMDGPUSubtarget::SEA_ISLANDS: return SIEncodingFamily::SI; case AMDGPUSubtarget::VOLCANIC_ISLANDS: + case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; // FIXME: This should never be called for r600 GPUs. 
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index bd8e389639f5..12caa5118342 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -16,11 +16,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H +#include "AMDGPU.h" #include "llvm/Target/TargetInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" #define GET_INSTRINFO_HEADER -#define GET_INSTRINFO_ENUM #include "AMDGPUGenInstrInfo.inc" namespace llvm { @@ -35,6 +35,8 @@ private: const AMDGPUSubtarget &ST; virtual void anchor(); +protected: + AMDGPUAS AMDGPUASI; public: explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index d7fa28bdc001..56f060984f08 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] >; +def AMDGPUFPPackOp : SDTypeProfile<1, 2, + [SDTCisFP<1>, SDTCisSameAs<1, 2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -42,10 +46,38 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def AMDGPUIfOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>] +>; + +def AMDGPUElseOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>] +>; + +def AMDGPULoopOp : SDTypeProfile<0, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] +>; + +def AMDGPUBreakOp : SDTypeProfile<1, 1, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>] +>; + +def AMDGPUIfBreakOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] +>; + +def AMDGPUElseBreakOp : SDTypeProfile<1, 2, + [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // +def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; +def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; +def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; + def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> @@ -78,6 +110,11 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; +def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; +def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; + + def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; // out = max(a, b) a and b are floats, where a nan comparison fails. @@ -92,17 +129,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; - -// out = max(a, b) a and b are signed ints -def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = max(a, b) a and b are unsigned ints -def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; // out = min(a, b) a and b are floats, where a nan comparison fails. 
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, @@ -194,6 +221,8 @@ def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; // Denominator, src2 = Numerator). def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; + // Look Up 2.0 / pi src0 with segment select src1[4:0] def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; @@ -291,15 +320,16 @@ def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ - SDTCisInt<0>, // i8 en - SDTCisInt<1>, // i1 vm + SDTCisInt<0>, // i8 tgt + SDTCisInt<1>, // i8 en + // i32 or f32 src0 + SDTCisSameAs<3, 2>, // f32 src1 + SDTCisSameAs<4, 2>, // f32 src2 + SDTCisSameAs<5, 2>, // f32 src3 + SDTCisInt<6>, // i1 compr // skip done - SDTCisInt<2>, // i8 tgt - SDTCisSameAs<3, 1>, // i1 compr - SDTCisFP<4>, // f32 src0 - SDTCisSameAs<5, 4>, // f32 src1 - SDTCisSameAs<6, 4>, // f32 src2 - SDTCisSameAs<7, 4> // f32 src3 + SDTCisInt<1> // i1 vm + ]>; def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, @@ -333,5 +363,9 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; -def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone, +def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp new file mode 100644 index 000000000000..8867ed689a31 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -0,0 +1,424 @@ +//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// AMDGPU. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPUInstructionSelector.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-isel" + +using namespace llvm; + +AMDGPUInstructionSelector::AMDGPUInstructionSelector( + const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI) + : InstructionSelector(), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {} + +MachineOperand +AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, + unsigned SubIdx) const { + + MachineInstr *MI = MO.getParent(); + MachineBasicBlock *BB = MO.getParent()->getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (MO.isReg()) { + unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); + unsigned Reg = MO.getReg(); + BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) + .addReg(Reg, 0, ComposedSubIdx); + + return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), + MO.isKill(), MO.isDead(), MO.isUndef(), + MO.isEarlyClobber(), 0, MO.isDebug(), + MO.isInternalRead()); + } + + assert(MO.isImm()); + + APInt Imm(64, MO.getImm()); + + switch (SubIdx) { + default: + llvm_unreachable("do not know to split immediate with this sub index."); + case AMDGPU::sub0: + return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); + case AMDGPU::sub1: + return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); + } +} + +bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); + unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + if (Size != 64) + return false; + + DebugLoc DL = I.getDebugLoc(); + + MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0)); + MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) + .add(Lo1) + .add(Lo2); + + MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1)); + MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1)); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) + .add(Hi1) + .add(Hi2); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg()) + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); + + for (MachineOperand &MO : I.explicit_operands()) { + if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI); + } + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { + return selectG_ADD(I); +} + +bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) 
const { + MachineBasicBlock *BB = I.getParent(); + DebugLoc DL = I.getDebugLoc(); + + // FIXME: Select store instruction based on address space + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD)) + .add(I.getOperand(1)) + .add(I.getOperand(0)) + .addImm(0) + .addImm(0) + .addImm(0); + + // Now that we selected an opcode, we need to constrain the register + // operands to use appropriate classes. + bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (Size == 32) { + I.setDesc(TII.get(AMDGPU::S_MOV_B32)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + assert(Size == 64); + + DebugLoc DL = I.getDebugLoc(); + unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + const APInt &Imm = I.getOperand(1).getCImm()->getValue(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + // We can't call constrainSelectedInstRegOperands here, because it doesn't + // work for target independent opcodes + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); +} + +static bool isConstant(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_CONSTANT; +} + +void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, + const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { + + const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); + + assert(PtrMI); + + if (PtrMI->getOpcode() != TargetOpcode::G_GEP) + return; + + GEPInfo GEPInfo(*PtrMI); + + for (unsigned i = 1, e = 3; i < e; ++i) { + const MachineOperand &GEPOp = PtrMI->getOperand(i); + const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); + assert(OpDef); + if (isConstant(*OpDef)) { + // FIXME: Is it possible to have multiple Imm parts? Maybe if we + // are lacking other optimizations. + assert(GEPInfo.Imm == 0); + GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); + continue; + } + const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); + if (OpBank->getID() == AMDGPU::SGPRRegBankID) + GEPInfo.SgprParts.push_back(GEPOp.getReg()); + else + GEPInfo.VgprParts.push_back(GEPOp.getReg()); + } + + AddrInfo.push_back(GEPInfo); + getAddrModeInfo(*PtrMI, MRI, AddrInfo); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + const Value *Ptr = MMO->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. 
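Aside (not part of the patch): for the 64-bit case, selectG_CONSTANT above materializes the immediate as two S_MOV_B32s joined by a REG_SEQUENCE. A self-contained sketch of the lo/hi split it relies on, using plain integers instead of APInt and an invented helper name.

#include <cstdint>
#include <utility>

// sub0/sub1 halves of a 64-bit immediate, as fed to the two S_MOV_B32s.
std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);        // low 32 bits  -> sub0
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);  // high 32 bits -> sub1
  return {Lo, Hi};
}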
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { + + if (LoadSize == 32) + return BaseOpcode; + + switch (BaseOpcode) { + case AMDGPU::S_LOAD_DWORD_IMM: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM; + } + break; + case AMDGPU::S_LOAD_DWORD_IMM_ci: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_IMM_ci; + case 128: + return AMDGPU::S_LOAD_DWORDX4_IMM_ci; + case 256: + return AMDGPU::S_LOAD_DWORDX8_IMM_ci; + case 512: + return AMDGPU::S_LOAD_DWORDX16_IMM_ci; + } + break; + case AMDGPU::S_LOAD_DWORD_SGPR: + switch (LoadSize) { + case 64: + return AMDGPU::S_LOAD_DWORDX2_SGPR; + case 128: + return AMDGPU::S_LOAD_DWORDX4_SGPR; + case 256: + return AMDGPU::S_LOAD_DWORDX8_SGPR; + case 512: + return AMDGPU::S_LOAD_DWORDX16_SGPR; + } + break; + } + llvm_unreachable("Invalid base smrd opcode or size"); +} + +bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { + for (const GEPInfo &GEPInfo : AddrInfo) { + if (!GEPInfo.VgprParts.empty()) + return true; + } + return false; +} + +bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, + ArrayRef<GEPInfo> AddrInfo) const { + + if (!I.hasOneMemOperand()) + return false; + + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS) + return false; + + if (!isInstrUniform(I)) + return false; + + if (hasVgprParts(AddrInfo)) + return false; + + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = I.getOperand(0).getReg(); + const DebugLoc &DL = I.getDebugLoc(); + unsigned Opcode; + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + + if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { + + const GEPInfo &GEPInfo = AddrInfo[0]; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); + if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && + isUInt<32>(EncodedImm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(EncodedImm) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + + if (isUInt<32>(GEPInfo.Imm)) { + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addReg(OffsetReg) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); + } + } + + unsigned PtrReg = 
I.getOperand(1).getReg(); + Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); + MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) + .addReg(PtrReg) + .addImm(0) + .addImm(0); // glc + return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); +} + + +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + DebugLoc DL = I.getDebugLoc(); + unsigned DstReg = I.getOperand(0).getReg(); + unsigned PtrReg = I.getOperand(1).getReg(); + unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Opcode; + + SmallVector<GEPInfo, 4> AddrInfo; + + getAddrModeInfo(I, MRI, AddrInfo); + + if (selectSMRD(I, AddrInfo)) { + I.eraseFromParent(); + return true; + } + + switch (LoadSize) { + default: + llvm_unreachable("Load size not supported\n"); + case 32: + Opcode = AMDGPU::FLAT_LOAD_DWORD; + break; + case 64: + Opcode = AMDGPU::FLAT_LOAD_DWORDX2; + break; + } + + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(I.getOperand(0)) + .addReg(PtrReg) + .addImm(0) + .addImm(0) + .addImm(0); + + bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; +} + +bool AMDGPUInstructionSelector::select(MachineInstr &I) const { + + if (!isPreISelGenericOpcode(I.getOpcode())) + return true; + + switch (I.getOpcode()) { + default: + break; + case TargetOpcode::G_ADD: + return selectG_ADD(I); + case TargetOpcode::G_CONSTANT: + return selectG_CONSTANT(I); + case TargetOpcode::G_GEP: + return selectG_GEP(I); + case TargetOpcode::G_LOAD: + return selectG_LOAD(I); + case TargetOpcode::G_STORE: + return selectG_STORE(I); + } + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h new file mode 100644 index 000000000000..c87102e55dfb --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -0,0 +1,67 @@ +//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the InstructionSelector class for +/// AMDGPU. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H + +#include "AMDGPU.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + +class AMDGPUInstrInfo; +class AMDGPURegisterBankInfo; +class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class SIInstrInfo; +class SIRegisterInfo; +class SISubtarget; + +class AMDGPUInstructionSelector : public InstructionSelector { +public: + AMDGPUInstructionSelector(const SISubtarget &STI, + const AMDGPURegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; +private: + struct GEPInfo { + const MachineInstr &GEP; + SmallVector<unsigned, 2> SgprParts; + SmallVector<unsigned, 2> VgprParts; + int64_t Imm; + GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } + }; + + MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + bool selectG_CONSTANT(MachineInstr &I) const; + bool selectG_ADD(MachineInstr &I) const; + bool selectG_GEP(MachineInstr &I) const; + bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; + void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, + SmallVectorImpl<GEPInfo> &AddrInfo) const; + bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const; + bool selectG_LOAD(MachineInstr &I) const; + bool selectG_STORE(MachineInstr &I) const; + + const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + const AMDGPURegisterBankInfo &RBI; +protected: + AMDGPUAS AMDGPUASI; +}; + +} // End llvm namespace. +#endif diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 59cba636c586..b8d681298dee 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -72,6 +72,49 @@ def u8imm : Operand<i8> { def brtarget : Operand<OtherVT>; //===----------------------------------------------------------------------===// +// Misc. 
PatFrags +//===----------------------------------------------------------------------===// + +class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + +class HasOneUseBinOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1), + (op $src0, $src1), + [{ return N->hasOneUse(); }] +>; + +class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1, node:$src2), + (op $src0, $src1, $src2), + [{ return N->hasOneUse(); }] +>; + +def trunc_oneuse : HasOneUseUnaryOp<trunc>; + +let Properties = [SDNPCommutative, SDNPAssociative] in { +def smax_oneuse : HasOneUseBinOp<smax>; +def smin_oneuse : HasOneUseBinOp<smin>; +def umax_oneuse : HasOneUseBinOp<umax>; +def umin_oneuse : HasOneUseBinOp<umin>; +def fminnum_oneuse : HasOneUseBinOp<fminnum>; +def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>; +def and_oneuse : HasOneUseBinOp<and>; +def or_oneuse : HasOneUseBinOp<or>; +def xor_oneuse : HasOneUseBinOp<xor>; +} // Properties = [SDNPCommutative, SDNPAssociative] + +def sub_oneuse : HasOneUseBinOp<sub>; + +def srl_oneuse : HasOneUseBinOp<srl>; +def shl_oneuse : HasOneUseBinOp<shl>; + +def select_oneuse : HasOneUseTernaryOp<select>; + +//===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -157,27 +200,11 @@ def COND_NULL : PatLeaf < //===----------------------------------------------------------------------===// -// Misc. PatFrags -//===----------------------------------------------------------------------===// - -class HasOneUseBinOp<SDPatternOperator op> : PatFrag< - (ops node:$src0, node:$src1), - (op $src0, $src1), - [{ return N->hasOneUse(); }] ->; - -class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< - (ops node:$src0, node:$src1, node:$src2), - (op $src0, $src1, $src2), - [{ return N->hasOneUse(); }] ->; - -//===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; }]>; class PrivateLoad <SDPatternOperator op> : PrivateMemOp < @@ -195,7 +222,7 @@ def truncstorei16_private : PrivateStore <truncstorei16>; def store_private : PrivateStore <store>; class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; // Global address space loads @@ -215,7 +242,7 @@ def global_store_atomic : GlobalStore<atomic_store>; class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; }]>; // Constant address space loads @@ -226,7 +253,7 @@ class ConstantLoad <SDPatternOperator op> : ConstantMemOp < def constant_load : ConstantLoad<load>; class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; // Local address space loads @@ -239,7 +266,7 @@ 
class LocalStore <SDPatternOperator op> : LocalMemOp < >; class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS; + return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUASI.FLAT_ADDRESS; }]>; class FlatLoad <SDPatternOperator op> : FlatMemOp < @@ -321,7 +348,7 @@ def local_store_aligned8bytes : Aligned8Bytes < class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; @@ -339,7 +366,7 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { @@ -349,7 +376,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def _64_local : PatFrag< @@ -357,7 +384,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; } @@ -367,17 +394,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> { def "" : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def _noret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def _ret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; } defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; @@ -395,22 +422,22 @@ defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; def AMDGPUatomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$value), (AMDGPUatomic_cmp_swap node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def atomic_cmp_swap_global : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; def atomic_cmp_swap_global_noret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, 
node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def atomic_cmp_swap_global_ret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; //===----------------------------------------------------------------------===// // Misc Pattern Fragments @@ -422,6 +449,7 @@ int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; +int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; int FP64_ONE = 0x3ff0000000000000; @@ -452,7 +480,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < @@ -565,6 +593,12 @@ multiclass BFIPatterns <Instruction BFI_INT, >; def : Pat < + (f32 (fcopysign f32:$src0, f64:$src1)), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, + (i32 (EXTRACT_SUBREG $src1, sub1))) + >; + + def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -602,10 +636,22 @@ def IMMPopCount : SDNodeXForm<imm, [{ MVT::i32); }]>; -class BFEPattern <Instruction BFE, Instruction MOV> : Pat < - (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), - (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) ->; +multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { + def : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) + >; + + def : Pat < + (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + (UBFE $src, (i32 0), $width) + >; + + def : Pat < + (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + (SBFE $src, (i32 0), $width) + >; +} // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : Pat < @@ -618,23 +664,13 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < class IntMed3Pat<Instruction med3Inst, SDPatternOperator max, SDPatternOperator max_oneuse, - SDPatternOperator min_oneuse> : Pat< - (max (min_oneuse i32:$src0, i32:$src1), - (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + SDPatternOperator min_oneuse, + ValueType vt = i32> : Pat< + (max (min_oneuse vt:$src0, vt:$src1), + (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) >; -let Properties = [SDNPCommutative, SDNPAssociative] in { -def smax_oneuse : HasOneUseBinOp<smax>; -def smin_oneuse : HasOneUseBinOp<smin>; -def umax_oneuse : HasOneUseBinOp<umax>; -def umin_oneuse : HasOneUseBinOp<umin>; -} // Properties = [SDNPCommutative, SDNPAssociative] - -def sub_oneuse : HasOneUseBinOp<sub>; - -def select_oneuse : HasOneUseTernaryOp<select>; - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 8e3471bd2083..86dc9bd9ea74 100644 --- 
a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -54,14 +54,7 @@ std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID, ArrayRef<Type*> Tys) const { // FIXME: Re-use Intrinsic::getType machinery - switch (ID) { - case AMDGPUIntrinsic::amdgcn_fdiv_fast: { - Type *F32Ty = Type::getFloatTy(Context); - return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false); - } - default: - llvm_unreachable("unhandled intrinsic"); - } + llvm_unreachable("unhandled intrinsic"); } unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, @@ -97,8 +90,8 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, Function *F = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); - AttributeSet AS = getAttributes(M->getContext(), - static_cast<AMDGPUIntrinsic::ID>(IntrID)); + AttributeList AS = + getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID)); F->setAttributes(AS); return F; } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ceae0b575395..18c9bd933af2 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -12,25 +12,8 @@ //===----------------------------------------------------------------------===// let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; - - // Deprecated in favor of llvm.amdgcn.sffbh - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // Deprecated in favor of separate int_amdgcn_cube* intrinsics. - def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - - // Deprecated in favor of expanded bit operations - def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Deprecated in favor of llvm.amdgcn.rsq - def int_AMDGPU_rsq : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] - >; } include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp new file mode 100644 index 000000000000..a2567a549028 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -0,0 +1,62 @@ +//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetOpcodes.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPULegalizerInfo::AMDGPULegalizerInfo() { + using namespace TargetOpcode; + + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT P1 = LLT::pointer(1, 64); + const LLT P2 = LLT::pointer(2, 64); + + setAction({G_CONSTANT, S64}, Legal); + + setAction({G_GEP, P1}, Legal); + setAction({G_GEP, P2}, Legal); + setAction({G_GEP, 1, S64}, Legal); + + setAction({G_LOAD, P1}, Legal); + setAction({G_LOAD, P2}, Legal); + setAction({G_LOAD, S32}, Legal); + setAction({G_LOAD, 1, P1}, Legal); + setAction({G_LOAD, 1, P2}, Legal); + + setAction({G_STORE, S32}, Legal); + setAction({G_STORE, 1, P1}, Legal); + + // FIXME: When RegBankSelect inserts copies, it will only create new + // registers with scalar types. This means we can end up with + // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer + // operands. In assert builds, the instruction selector will assert + // if it sees a generic instruction which isn't legal, so we need to + // tell it that scalar types are legal for pointer operands + setAction({G_GEP, S64}, Legal); + setAction({G_LOAD, 1, S64}, Legal); + setAction({G_STORE, 1, S64}, Legal); + + computeTables(); +} diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h new file mode 100644 index 000000000000..291e3361f163 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -0,0 +1,30 @@ +//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class LLVMContext; + +/// This class provides the information for the target register banks. +class AMDGPULegalizerInfo : public LegalizerInfo { +public: + AMDGPULegalizerInfo(); +}; +} // End llvm namespace. +#endif diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp new file mode 100644 index 000000000000..dcb6670621ee --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -0,0 +1,160 @@ +//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +#define DEBUG_TYPE "amdgpu-lower-intrinsics" + +using namespace llvm; + +namespace { + +const unsigned MaxStaticSize = 1024; + +class AMDGPULowerIntrinsics : public ModulePass { +private: + const TargetMachine *TM; + + bool makeLIDRangeMetadata(Function &F) const; + +public: + static char ID; + + AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr) + : ModulePass(ID), TM(TM) { } + bool runOnModule(Module &M) override; + StringRef getPassName() const override { + return "AMDGPU Lower Intrinsics"; + } +}; + +} + +char AMDGPULowerIntrinsics::ID = 0; + +char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; + +INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, + "Lower intrinsics", false, false) + +// TODO: Should refine based on estimated number of accesses (e.g. does it +// require splitting based on alignment) +static bool shouldExpandOperationWithSize(Value *Size) { + ConstantInt *CI = dyn_cast<ConstantInt>(Size); + return !CI || (CI->getZExtValue() > MaxStaticSize); +} + +static bool expandMemIntrinsicUses(Function &F) { + Intrinsic::ID ID = F.getIntrinsicID(); + bool Changed = false; + + for (auto I = F.user_begin(), E = F.user_end(); I != E;) { + Instruction *Inst = cast<Instruction>(*I); + ++I; + + switch (ID) { + case Intrinsic::memcpy: { + auto *Memcpy = cast<MemCpyInst>(Inst); + if (shouldExpandOperationWithSize(Memcpy->getLength())) { + expandMemCpyAsLoop(Memcpy); + Changed = true; + Memcpy->eraseFromParent(); + } + + break; + } + case Intrinsic::memmove: { + auto *Memmove = cast<MemMoveInst>(Inst); + if (shouldExpandOperationWithSize(Memmove->getLength())) { + expandMemMoveAsLoop(Memmove); + Changed = true; + Memmove->eraseFromParent(); + } + + break; + } + case Intrinsic::memset: { + auto *Memset = cast<MemSetInst>(Inst); + if (shouldExpandOperationWithSize(Memset->getLength())) { + expandMemSetAsLoop(Memset); + Changed = true; + Memset->eraseFromParent(); + } + + break; + } + default: + break; + } + } + + return Changed; +} + +bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { + if (!TM) + return false; + + bool Changed = false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + + for (auto *U : F.users()) { + auto *CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + Changed |= ST.makeLIDRangeMetadata(CI); + } + return Changed; +} + +bool AMDGPULowerIntrinsics::runOnModule(Module &M) { + bool Changed = false; + + for (Function &F : M) { + if (!F.isDeclaration()) + continue; + + switch (F.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + if (expandMemIntrinsicUses(F)) + Changed = true; + break; + + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::r600_read_local_size_x: + case Intrinsic::r600_read_local_size_y: + case Intrinsic::r600_read_local_size_z: + Changed |= makeLIDRangeMetadata(F); + break; + + default: + break; + } + } + + return Changed; +} + +ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) { + return new 
AMDGPULowerIntrinsics(TM); +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 7d56355074b1..14ee1c81f8fa 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -151,6 +151,28 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, return MCInstLowering.lowerOperand(MO, MCOp); } +const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { + // TargetMachine does not support llvm-style cast. Use C++-style cast. + // This is safe since TM is always of type AMDGPUTargetMachine or its + // derived class. + auto *AT = static_cast<AMDGPUTargetMachine*>(&TM); + auto *CE = dyn_cast<ConstantExpr>(CV); + + // Lower null pointers in private and local address space. + // Clang generates addrspacecast for null pointers in private and local + // address space, which needs to be lowered. + if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) { + auto Op = CE->getOperand(0); + auto SrcAddr = Op->getType()->getPointerAddressSpace(); + if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) { + auto DstAddr = CE->getType()->getPointerAddressSpace(); + return MCConstantExpr::create(AT->getNullPointerValue(DstAddr), + OutContext); + } + } + return AsmPrinter::lowerConstant(CV); +} + void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; @@ -162,7 +184,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); C.emitError("Illegal instruction detected: " + Err); - MI->dump(); + MI->print(errs()); } if (MI->isBundle()) { @@ -173,8 +195,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { ++I; } } else { - // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder - // terminator instructions and should only be printed as comments. + // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are + // placeholder terminator instructions and should only be printed as + // comments. 
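Aside (not part of the patch): the lowerConstant change earlier in this hunk exists because a null pointer's bit pattern is per address space, so an addrspacecast of a generic null constant must be emitted as the destination space's null value rather than 0. A hedged sketch of the idea; the address-space numbers and the all-ones sentinel below are assumptions for illustration only, the real values come from AMDGPUTargetMachine::getNullPointerValue.

#include <cstdint>

// Illustrative only: per-address-space null-pointer encodings.
uint64_t nullPointerValueFor(unsigned AddrSpace) {
  switch (AddrSpace) {
  case 3:  // local   (assumed numbering)
  case 5:  // private (assumed numbering)
    return 0xffffffffu;  // assumed non-zero sentinel for the example
  default:
    return 0;            // generic/global null stays 0
  }
}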
if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) { if (isVerbose()) { SmallVector<char, 16> BBStr; @@ -190,9 +213,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - if (MI->getOpcode() == AMDGPU::SI_RETURN) { + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) - OutStreamer->emitRawComment(" return"); + OutStreamer->emitRawComment(" return to shader part epilog"); return; } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 40c3327a98db..27fe639e3d4b 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -12,6 +12,20 @@ using namespace llvm; +static bool isEntryFunctionCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), @@ -19,8 +33,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL || - MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) { + IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())), + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 5d0640b816f3..8bfeb67ad4ec 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -30,7 +30,11 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { /// Start of implicit kernel args unsigned ABIArgOffset; - bool IsKernel; + // Kernels + shaders. i.e. functions called by the driver and not not called + // by other functions. + bool IsEntryFunction; + + bool NoSignedZerosFPMath; public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -66,8 +70,12 @@ public: return LDSSize; } - bool isKernel() const { - return IsKernel; + bool isEntryFunction() const { + return IsEntryFunction; + } + + bool hasNoSignedZerosFPMath() const { + return NoSignedZerosFPMath; } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index 947d45b66969..71b9ab699b96 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -19,12 +19,13 @@ namespace AMDGPU { -namespace PT_NOTE { +namespace ElfNote { const char SectionName[] = ".note"; const char NoteName[] = "AMD"; +// TODO: Move this enum to include/llvm/Support so it can be used in tools? 
enum NoteType{ NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, NT_AMDGPU_HSA_HSAIL = 2, @@ -32,7 +33,7 @@ enum NoteType{ NT_AMDGPU_HSA_PRODUCER = 4, NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_RUNTIME_METADATA = 7, + NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10, NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 }; diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index baa28de7a770..4fb262c6277c 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -14,12 +14,49 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <map> +#include <tuple> +#include <utility> +#include <vector> #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -31,16 +68,16 @@ namespace { class AMDGPUPromoteAlloca : public FunctionPass { private: const TargetMachine *TM; - Module *Mod; - const DataLayout *DL; - MDNode *MaxWorkGroupSizeRange; + Module *Mod = nullptr; + const DataLayout *DL = nullptr; + AMDGPUAS AS; // FIXME: This should be per-kernel. 
- uint32_t LocalMemLimit; - uint32_t CurrentLocalMemUsage; + uint32_t LocalMemLimit = 0; + uint32_t CurrentLocalMemUsage = 0; - bool IsAMDGCN; - bool IsAMDHSA; + bool IsAMDGCN = false; + bool IsAMDHSA = false; std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); @@ -63,15 +100,7 @@ public: static char ID; AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) : - FunctionPass(ID), - TM(TM_), - Mod(nullptr), - DL(nullptr), - MaxWorkGroupSizeRange(nullptr), - LocalMemLimit(0), - CurrentLocalMemUsage(0), - IsAMDGCN(false), - IsAMDHSA(false) { } + FunctionPass(ID), TM(TM_) {} bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -86,7 +115,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char AMDGPUPromoteAlloca::ID = 0; @@ -95,7 +124,6 @@ INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; - bool AMDGPUPromoteAlloca::doInitialization(Module &M) { if (!TM) return false; @@ -103,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { Mod = &M; DL = &Mod->getDataLayout(); - // The maximum workitem id. - // - // FIXME: Should get as subtarget property. Usually runtime enforced max is - // 256. - MDBuilder MDB(Mod->getContext()); - MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048)); - const Triple &TT = TM->getTargetTriple(); IsAMDGCN = TT.getArch() == Triple::amdgcn; @@ -125,6 +146,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); if (!ST.isPromoteAllocaEnabled()) return false; + AS = AMDGPU::getAMDGPUAS(*F.getParent()); FunctionType *FTy = F.getFunctionType(); @@ -133,7 +155,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // we cannot use local memory in the pass. for (Type *ParamTy : FTy->params()) { PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { LocalMemLimit = 0; DEBUG(dbgs() << "Function has local memory argument. Promoting to " "local memory disabled.\n"); @@ -150,7 +172,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) continue; for (const User *U : GV.users()) { @@ -175,7 +197,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { } } - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage); + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. @@ -196,7 +219,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Round up to the next tier of usage. unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy); + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); // Program is possibly broken by using more local mem than available. 
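Aside (not part of the patch): the occupancy plumbing above implements a budgeting rule: whatever occupancy tier the module's existing LDS usage already implies, alloca promotion may not push usage past that tier. A toy model of the rule; the 64 KiB pool and 10-wave cap are illustrative assumptions, not what getOccupancyWithLocalMemSize / getMaxLocalMemSizeWithWaveCount actually return.

#include <algorithm>
#include <cstdint>

// Illustrative only: pretend 64 KiB of LDS is split evenly among resident waves.
uint32_t occupancyWithLDS(uint32_t BytesUsed, uint32_t MaxWaves = 10) {
  if (BytesUsed == 0)
    return MaxWaves;
  return std::min<uint32_t>(MaxWaves, 65536 / BytesUsed);
}

uint32_t ldsBudgetForPromotion(uint32_t CurrentUsage) {
  uint32_t Occ = occupancyWithLDS(CurrentUsage);  // tier already implied
  return 65536 / std::max<uint32_t>(Occ, 1);      // don't drop below that tier
}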
if (CurrentLocalMemUsage > MaxSizeWithWaveCount) @@ -226,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); + if (!IsAMDHSA) { Function *LocalSizeYFn = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); @@ -235,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); - LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); - LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LocalSizeY); + ST.makeLIDRangeMetadata(LocalSizeZ); return std::make_pair(LocalSizeY, LocalSizeZ); } @@ -279,15 +305,15 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); - DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); - DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); // Size of the dispatch packet struct. - DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64); + DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64); Type *I32Ty = Type::getInt32Ty(Mod->getContext()); Value *CastDispatchPtr = Builder.CreateBitCast( - DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); + DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS)); // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier @@ -298,10 +324,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); - MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); + MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); - LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LoadZU); // Extract y component. Upper half of LoadZU should be zero already. 
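Aside (not part of the patch): the two aligned loads above read dwords 1 and 2 of the dispatch packet, where the 16-bit workgroup sizes are packed, and the y component is then recovered by the LShr that immediately follows. A plain sketch of that unpacking, assuming the same field layout as the code above.

#include <cstdint>
#include <utility>

// DwordXY = dispatch packet dword 1 (workgroup_size_x | workgroup_size_y << 16)
// DwordZU = dispatch packet dword 2 (workgroup_size_z in the low half;
//           the upper half is expected to be zero already)
std::pair<uint32_t, uint32_t> unpackLocalSizeYZ(uint32_t DwordXY,
                                                uint32_t DwordZU) {
  uint32_t Y = DwordXY >> 16;  // extract the y component
  uint32_t Z = DwordZU;        // upper bits assumed zero, used as-is
  return {Y, Z};
}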
Value *Y = Builder.CreateLShr(LoadXY, 16); @@ -310,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -332,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); - CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(CI); return CI; } @@ -383,7 +411,7 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); DEBUG(dbgs() << "Alloca candidate for vectorization\n"); @@ -438,7 +466,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); @@ -450,7 +478,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { break; } case Instruction::Store: { - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); @@ -580,6 +608,9 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( } if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { + // Give up if the pointer may be captured. + if (PointerMayBeCaptured(UseInst, true, true)) + return false; // Don't collect the users of this. WorkList.push_back(User); continue; @@ -640,7 +671,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) { + if (tryPromoteAllocaToVector(&I, AS)) { DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); return; } @@ -655,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); - // FIXME: We should also try to get this value from the reqd_work_group_size - // function attribute if it is available. 
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); @@ -701,7 +730,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, - AMDGPUAS::LOCAL_ADDRESS); + AS.LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(I.getAlignment()); @@ -734,7 +763,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); Type *EltTy = Src0->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); @@ -751,7 +780,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { continue; Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. @@ -819,17 +848,17 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, - { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } + { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) } ); - CallInst *NewCall - = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); + CallInst *NewCall = Builder.CreateCall( + ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)}); Intr->replaceAllUsesWith(NewCall); Intr->eraseFromParent(); continue; } default: - Intr->dump(); + Intr->print(errs()); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp new file mode 100644 index 000000000000..a5edc0c3b937 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -0,0 +1,230 @@ +//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPUInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "AMDGPUGenRegisterBank.inc" + +// This file will be TableGen'ed at some point. 
+#include "AMDGPUGenRegisterBankInfo.def" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) + : AMDGPUGenRegisterBankInfo(), + TRI(static_cast<const SIRegisterInfo*>(&TRI)) { + + // HACK: Until this is fully tablegen'd + static bool AlreadyInit = false; + if (AlreadyInit) + return; + + AlreadyInit = true; + + const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); + (void)RBSGPR; + assert(&RBSGPR == &AMDGPU::SGPRRegBank); + + const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); + (void)RBVGPR; + assert(&RBVGPR == &AMDGPU::VGPRRegBank); + +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + + if (TRI->isSGPRClass(&RC)) + return getRegBank(AMDGPU::SGPRRegBankID); + + return getRegBank(AMDGPU::VGPRRegBankID); +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + InstructionMappings AltMappings; + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + // FIXME: Should we be hard coding the size for these mappings? + InstructionMapping SSMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(SSMapping)); + + InstructionMapping VVMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(VVMapping)); + + // FIXME: Should this be the pointer-size (64-bits) or the size of the + // register that will hold the bufffer resourc (128-bits). 
+ InstructionMapping VSMapping(3, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(VSMapping)); + + return AltMappings; + + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AMDGPURegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + return applyDefaultMapping(OpdMapper); +} + +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return AMDGPU::isUniformMMO(MMO); +} + +RegisterBankInfo::InstructionMapping +AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + RegisterBankInfo::InstructionMapping Mapping = + InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + + const ValueMapping *ValMapping; + const ValueMapping *PtrMapping; + + if (isInstrUniform(MI)) { + // We have a uniform instruction so we want to use an SMRD load + ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); + } else { + ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: What would happen if we used SGPRRegBankID here? + PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); + } + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + + // FIXME: Do we want to add a mapping for FLAT load, or should we just + // handle that during instruction selection? +} + +RegisterBankInfo::InstructionMapping +AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI); + + if (Mapping.isValid()) + return Mapping; + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + + switch (MI.getOpcode()) { + default: break; + case AMDGPU::G_CONSTANT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + case AMDGPU::G_GEP: { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg()) + continue; + + unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits(); + OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + } + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + case AMDGPU::G_STORE: { + assert(MI.getOperand(0).isReg()); + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + // FIXME: We need to specify a different reg bank once scalar stores + // are supported. 
+ const ValueMapping *ValMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + // FIXME: Depending on the type of store, the pointer could be in + // the SGPR Reg bank. + // FIXME: Pointer size should be based on the address space. + const ValueMapping *PtrMapping = + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + + OpdsMapping[0] = ValMapping; + OpdsMapping[1] = PtrMapping; + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + return Mapping; + } + + case AMDGPU::G_LOAD: + return getInstrMappingForLoad(MI); + } + + unsigned BankID = AMDGPU::SGPRRegBankID; + + Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()}; + unsigned Size = 0; + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + // If the operand is not a register default to the size of the previous + // operand. + // FIXME: Can't we pull the types from the MachineInstr rather than the + // operands. + if (MI.getOperand(Idx).isReg()) + Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI); + OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size)); + } + Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); + + return Mapping; +} diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h new file mode 100644 index 000000000000..f13bde87ef2d --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -0,0 +1,65 @@ +//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AMDGPU. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +namespace llvm { + +class SIRegisterInfo; +class TargetRegisterInfo; + +namespace AMDGPU { +enum { + SGPRRegBankID = 0, + VGPRRegBankID = 1, + NumRegisterBanks +}; +} // End AMDGPU namespace. + +/// This class provides the information for the target register banks. +class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { + +protected: + +#define GET_TARGET_REGBANK_CLASS +#include "AMDGPUGenRegisterBank.inc" + +}; +class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { + const SIRegisterInfo *TRI; + + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + + RegisterBankInfo::InstructionMapping + getInstrMappingForLoad(const MachineInstr &MI) const; + +public: + AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); + + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; + + InstructionMapping getInstrMapping(const MachineInstr &MI) const override; +}; +} // End llvm namespace. 
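Taken together, the new AMDGPURegisterBankInfo files model exactly two banks, SGPR and VGPR: getRegBankFromRegClass sends SGPR register classes to the SGPR bank and everything else to the VGPR bank, and opcodes not handled in getInstrMapping default every register operand to the SGPR bank. For a 32-bit G_LOAD the alternative mappings above are, in order of ID: SS (result and 64-bit pointer both on the SGPR bank, the scalar SMRD shape that getInstrMappingForLoad selects when isInstrUniform sees a single uniform MachineMemOperand), VV (both operands on the VGPR bank, the shape used for divergent addresses), and VS (a VGPR result fed by an SGPR pointer), with the pointer portion hard-coded to 64 bits as the in-line FIXMEs note.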
+#endif diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td new file mode 100644 index 000000000000..f4428e56035f --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -0,0 +1,16 @@ +//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def SGPRRegBank : RegisterBank<"SGPR", + [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512] +>; + +def VGPRRegBank : RegisterBank<"VGPR", + [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] +>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index ef51aad95dce..22b1663821d9 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -16,10 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H -#include "llvm/Target/TargetRegisterInfo.h" - #define GET_REGINFO_HEADER -#define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h deleted file mode 100644 index ecd2ac72bf1b..000000000000 --- a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h +++ /dev/null @@ -1,193 +0,0 @@ -//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Enums and structure types used by runtime metadata. -/// -/// Runtime requests certain information (metadata) about kernels to be able -/// to execute the kernels and answer the queries about the kernels. -/// The metadata is represented as a note element in the .note ELF section of a -/// binary (code object). The desc field of the note element is a YAML string -/// consisting of key-value pairs. Each key is a string. Each value can be -/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps. -/// At the beginning of the YAML string is the module level YAML map. A -/// kernel-level YAML map is in the amd.Kernels sequence. A -/// kernel-argument-level map is in the amd.Args sequence. -/// -/// The format should be kept backward compatible. New enum values and bit -/// fields should be appended at the end. It is suggested to bump up the -/// revision number whenever the format changes and document the change -/// in the revision in this header. -/// -// -//===----------------------------------------------------------------------===// -// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H - -#include <cstdint> -#include <vector> -#include <string> - -namespace AMDGPU { - -namespace RuntimeMD { - - // Version and revision of runtime metadata - const unsigned char MDVersion = 2; - const unsigned char MDRevision = 0; - - // Name of keys for runtime metadata. 
- namespace KeyName { - const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version - const char Language[] = "amd.Language"; // Language - const char LanguageVersion[] = "amd.LanguageVersion"; // Language version - const char Kernels[] = "amd.Kernels"; // Kernels - const char KernelName[] = "amd.KernelName"; // Kernel name - const char Args[] = "amd.Args"; // Kernel arguments - const char ArgSize[] = "amd.ArgSize"; // Kernel arg size - const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment - const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name - const char ArgName[] = "amd.ArgName"; // Kernel name - const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind - const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type - const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier - const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier - const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified - const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified - const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified - const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified - const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size - const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint - const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint - const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue - const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups - const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information - const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier - const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type - } - - namespace KernelArg { - enum Kind : uint8_t { - ByValue = 0, - GlobalBuffer = 1, - DynamicSharedPointer = 2, - Sampler = 3, - Image = 4, - Pipe = 5, - Queue = 6, - HiddenGlobalOffsetX = 7, - HiddenGlobalOffsetY = 8, - HiddenGlobalOffsetZ = 9, - HiddenNone = 10, - HiddenPrintfBuffer = 11, - HiddenDefaultQueue = 12, - HiddenCompletionAction = 13, - }; - - enum ValueType : uint16_t { - Struct = 0, - I8 = 1, - U8 = 2, - I16 = 3, - U16 = 4, - F16 = 5, - I32 = 6, - U32 = 7, - F32 = 8, - I64 = 9, - U64 = 10, - F64 = 11, - }; - - // Avoid using 'None' since it conflicts with a macro in X11 header file. - enum AccessQualifer : uint8_t { - AccNone = 0, - ReadOnly = 1, - WriteOnly = 2, - ReadWrite = 3, - }; - - enum AddressSpaceQualifer : uint8_t { - Private = 0, - Global = 1, - Constant = 2, - Local = 3, - Generic = 4, - Region = 5, - }; - } // namespace KernelArg - - // Invalid values are used to indicate an optional key should not be emitted. - const uint8_t INVALID_ADDR_QUAL = 0xff; - const uint8_t INVALID_ACC_QUAL = 0xff; - const uint32_t INVALID_KERNEL_INDEX = ~0U; - - namespace KernelArg { - // In-memory representation of kernel argument information. 
- struct Metadata { - uint32_t Size; - uint32_t Align; - uint32_t PointeeAlign; - uint8_t Kind; - uint16_t ValueType; - std::string TypeName; - std::string Name; - uint8_t AddrQual; - uint8_t AccQual; - uint8_t IsVolatile; - uint8_t IsConst; - uint8_t IsRestrict; - uint8_t IsPipe; - Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0), - AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0), - IsConst(0), IsRestrict(0), IsPipe(0) {} - }; - } - - namespace Kernel { - // In-memory representation of kernel information. - struct Metadata { - std::string Name; - std::string Language; - std::vector<uint8_t> LanguageVersion; - std::vector<uint32_t> ReqdWorkGroupSize; - std::vector<uint32_t> WorkGroupSizeHint; - std::string VecTypeHint; - uint32_t KernelIndex; - uint8_t NoPartialWorkGroups; - std::vector<KernelArg::Metadata> Args; - Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {} - }; - } - - namespace Program { - // In-memory representation of program information. - struct Metadata { - std::vector<uint8_t> MDVersionSeq; - std::vector<std::string> PrintfInfo; - std::vector<Kernel::Metadata> Kernels; - - explicit Metadata(){} - - // Construct from an YAML string. - explicit Metadata(const std::string &YAML); - - // Convert to YAML string. - std::string toYAML(); - - // Convert from YAML string. - static Metadata fromYAML(const std::string &S); - }; - } -} // namespace RuntimeMD -} // namespace AMDGPU - -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index c35a67de1d7f..972c28579f7a 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,8 +13,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetFrameLowering.h" #include <algorithm> @@ -22,7 +24,6 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -#define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" @@ -41,9 +42,10 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-for-global,+unaligned-buffer-access,"; + FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -59,9 +61,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP16Denormals = false; + FP64FP16Denormals = false; FP32Denormals = false; - FP64Denormals = false; } // Set defaults if needed. 
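The hunks that follow replace the hard-coded SI occupancy table with a formula driven by the calling function: the LDS budget for NWaves becomes LocalMemorySize * MaxWavesPerEU / MaxWorkGroupsPerCU(FlatWorkGroupSize) / NWaves, and getOccupancyWithLocalMemSize inverts that relation, clamping the result to [1, MaxWavesPerEU]. A worked example, assuming 64 KiB of LDS per CU, 64-wide wavefronts, and the pre-change constants this patch later removes from AMDGPUSubtarget.h (10 waves per EU, 16 work groups per CU for multi-wave groups): a kernel with a 256-item flat work group may use 65536 * 10 / 16 / 4 = 10240 bytes of LDS and still sustain 4 waves, while one already using 16384 bytes of LDS is reported at 65536 * 10 / 16 / 16384 = 2 waves of occupancy.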
@@ -85,15 +86,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FastFMAF32(false), HalfRate64Ops(false), - FP16Denormals(false), FP32Denormals(false), - FP64Denormals(false), + FP64FP16Denormals(false), FPExceptions(false), + DX10Clamp(false), FlatForGlobal(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), + HasApertureRegs(false), EnableXNACK(false), + TrapHandler(false), DebuggerInsertNops(false), DebuggerReserveRegs(false), DebuggerEmitPrologue(false), @@ -110,13 +113,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), + GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), Has16BitInsts(false), + HasVOP3PInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), HasInv2PiInlineImm(false), + HasSDWA(false), + HasDPP(false), FlatAddressSpace(false), R600ALUInst(false), @@ -128,65 +135,30 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FeatureDisable(false), InstrItins(getInstrItineraryForCPU(GPU)) { + AS = AMDGPU::getAMDGPUAS(TT); initializeSubtargetDependencies(TT, GPU, FS); } -// FIXME: These limits are for SI. Did they change with the larger maximum LDS -// size? -unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { - switch (NWaves) { - case 10: - return 1638; - case 9: - return 1820; - case 8: - return 2048; - case 7: - return 2340; - case 6: - return 2730; - case 5: - return 3276; - case 4: - return 4096; - case 3: - return 5461; - case 2: - return 8192; - default: +unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, + const Function &F) const { + if (NWaves == 1) return getLocalMemorySize(); - } + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { - if (Bytes <= 1638) - return 10; - - if (Bytes <= 1820) - return 9; - - if (Bytes <= 2048) - return 8; - - if (Bytes <= 2340) - return 7; - - if (Bytes <= 2730) - return 6; - - if (Bytes <= 3276) - return 5; - - if (Bytes <= 4096) - return 4; - - if (Bytes <= 5461) - return 3; - - if (Bytes <= 8192) - return 2; - - return 1; +unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, + const Function &F) const { + unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; + unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + unsigned MaxWaves = getMaxWavesPerEU(); + unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; + unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); + NumWaves = std::min(NumWaves, MaxWaves); + NumWaves = std::max(NumWaves, 1u); + return NumWaves; } std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( @@ -224,7 +196,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair<unsigned, unsigned> Default(1, 0); + std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); // Default/requested minimum/maximum flat work group sizes. 
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); @@ -269,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( return Requested; } +bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { + Function *Kernel = I->getParent()->getParent(); + unsigned MinSize = 0; + unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; + bool IdQuery = false; + + // If reqd_work_group_size is present it narrows value down. + if (auto *CI = dyn_cast<CallInst>(I)) { + const Function *F = CI->getCalledFunction(); + if (F) { + unsigned Dim = UINT_MAX; + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + IdQuery = true; + case Intrinsic::r600_read_local_size_x: + Dim = 0; + break; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + IdQuery = true; + case Intrinsic::r600_read_local_size_y: + Dim = 1; + break; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + IdQuery = true; + case Intrinsic::r600_read_local_size_z: + Dim = 2; + break; + default: + break; + } + if (Dim <= 3) { + if (auto Node = Kernel->getMetadata("reqd_work_group_size")) + if (Node->getNumOperands() == 3) + MinSize = MaxSize = mdconst::extract<ConstantInt>( + Node->getOperand(Dim))->getZExtValue(); + } + } + } + + if (!MaxSize) + return false; + + // Range metadata is [Lo, Hi). For ID query we need to pass max size + // as Hi. For size query we need to pass Hi + 1. + if (IdQuery) + MinSize = 0; + else + ++MaxSize; + + MDBuilder MDB(I->getContext()); + MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), + APInt(32, MaxSize)); + I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + return true; +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUSubtarget(TT, GPU, FS, TM), @@ -305,7 +336,7 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { } unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplicitArgBytes) const { + unsigned ExplicitArgBytes) const { unsigned ImplicitBytes = getImplicitArgNumBytes(MF); if (ImplicitBytes == 0) return ExplicitArgBytes; @@ -359,12 +390,100 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { return 1; } -unsigned SISubtarget::getMaxNumSGPRs() const { +unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.hasFlatScratchInit()) { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). + if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) + return 4; // FLAT_SCRATCH, VCC (in that order). + } + + if (isXNACKEnabled()) + return 4; // XNACK, VCC (in that order). + return 2; // VCC. +} + +unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of SGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); + unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); + + // Check if maximum number of SGPRs was explicitly requested using + // "amdgpu-num-sgpr" attribute. 
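makeLIDRangeMetadata, added above, encodes the half-open [Lo, Hi) convention of !range metadata and distinguishes ID queries from size queries. As a worked example, for a kernel carrying reqd_work_group_size metadata of {64, 1, 1}: a call to amdgcn_workitem_id_x (an ID query) receives !range [0, 64), since Lo is reset to 0 and the required size is used as Hi, while r600_read_local_size_x (a size query) receives !range [64, 65), pinning the result to exactly 64. Without that metadata, Hi falls back to the function's maximum flat work-group size (plus one for size queries), and the hook returns false only when that maximum is zero.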
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-sgpr", MaxNumSGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && (Requested <= getReservedNumSGPRs(MF))) + Requested = 0; + + // If more SGPRs are required to support the input user/system SGPRs, + // increase to accommodate them. + // + // FIXME: This really ends up using the requested number of SGPRs + number + // of reserved special registers in total. Theoretically you could re-use + // the last input registers for these special registers, but this would + // require a lot of complexity to deal with the weird aliasing. + unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); + if (Requested && Requested < InputNumSGPRs) + Requested = InputNumSGPRs; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. + if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumSGPRs = Requested; + } + if (hasSGPRInitBug()) - return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + + return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), + MaxAddressableNumSGPRs); +} - if (getGeneration() >= VOLCANIC_ISLANDS) - return 102; +unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = *MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + + // Compute maximum number of VGPRs function can use using default/requested + // minimum number of waves per execution unit. + std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); + unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); + + // Check if maximum number of VGPRs was explicitly requested using + // "amdgpu-num-vgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-vgpr")) { + unsigned Requested = AMDGPU::getIntegerAttribute( + F, "amdgpu-num-vgpr", MaxNumVGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && Requested <= getReservedNumVGPRs(MF)) + Requested = 0; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit. 
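getMaxNumSGPRs and getMaxNumVGPRs in these hunks honour per-function string attributes and silently ignore requests that violate the subtarget's limits. A minimal sketch of how a frontend would opt a kernel into such a cap, assuming an existing llvm::Function for it; the helper name and the numeric values are only illustrative:

    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Request explicit register budgets.  getMaxNumSGPRs/getMaxNumVGPRs parse
    // these strings and fall back to the waves-per-EU derived limits when a
    // request is out of range (for example, not above the reserved registers).
    static void requestRegisterBudget(Function &F) {
      F.addFnAttr("amdgpu-num-sgpr", "48");  // illustrative value
      F.addFnAttr("amdgpu-num-vgpr", "32");  // illustrative value
    }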
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) + Requested = 0; + if (WavesPerEU.second && + Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumVGPRs = Requested; + } - return 104; + return MaxNumVGPRs - getReservedNumVGPRs(MF); } diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 0e3cb7dc1f87..36bc2498781f 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "SIISelLowering.h" #include "SIFrameLowering.h" +#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" @@ -51,6 +52,7 @@ public: SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS, + GFX9, }; enum { @@ -64,6 +66,28 @@ public: ISAVersion8_0_3, ISAVersion8_0_4, ISAVersion8_1_0, + ISAVersion9_0_0, + ISAVersion9_0_1 + }; + + enum TrapHandlerAbi { + TrapHandlerAbiNone = 0, + TrapHandlerAbiHsa = 1 + }; + + enum TrapID { + TrapIDHardwareReserved = 0, + TrapIDHSADebugTrap = 1, + TrapIDLLVMTrap = 2, + TrapIDLLVMDebugTrap = 3, + TrapIDDebugBreakpoint = 7, + TrapIDDebugReserved8 = 8, + TrapIDDebugReservedFE = 0xfe, + TrapIDDebugReservedFF = 0xff + }; + + enum TrapRegValues { + LLVMTrapHandlerRegValue = 1 }; protected: @@ -81,14 +105,16 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. - bool FP16Denormals; bool FP32Denormals; - bool FP64Denormals; + bool FP64FP16Denormals; bool FPExceptions; + bool DX10Clamp; bool FlatForGlobal; bool UnalignedScratchAccess; bool UnalignedBufferAccess; + bool HasApertureRegs; bool EnableXNACK; + bool TrapHandler; bool DebuggerInsertNops; bool DebuggerReserveRegs; bool DebuggerEmitPrologue; @@ -107,13 +133,17 @@ protected: bool GCN1Encoding; bool GCN3Encoding; bool CIInsts; + bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; bool Has16BitInsts; + bool HasVOP3PInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; bool HasInv2PiInlineImm; + bool HasSDWA; + bool HasDPP; bool FlatAddressSpace; bool R600ALUInst; bool CaymanISA; @@ -127,6 +157,7 @@ protected: InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; + AMDGPUAS AS; public: AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, @@ -184,10 +215,18 @@ public: return MaxPrivateElementSize; } + AMDGPUAS getAMDGPUAS() const { + return AS; + } + bool has16BitInsts() const { return Has16BitInsts; } + bool hasVOP3PInsts() const { + return HasVOP3PInsts; + } + bool hasHWFP64() const { return FP64; } @@ -243,6 +282,10 @@ public: return (getGeneration() >= EVERGREEN); } + bool hasMed3_16() const { + return getGeneration() >= GFX9; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } @@ -255,6 +298,10 @@ public: return CaymanISA; } + TrapHandlerAbi getTrapHandlerAbi() const { + return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } @@ -267,20 +314,22 @@ public: return DumpCode; } - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. - unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; /// Inverse of getMaxLocalMemWithWaveCount. 
Return the maximum wavecount if /// the given LDS memory size is the only constraint. - unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction()); + } bool hasFP16Denormals() const { - return FP16Denormals; + return FP64FP16Denormals; } bool hasFP32Denormals() const { @@ -288,13 +337,21 @@ public: } bool hasFP64Denormals() const { - return FP64Denormals; + return FP64FP16Denormals; } bool hasFPExceptions() const { return FPExceptions; } + bool enableDX10Clamp() const { + return DX10Clamp; + } + + bool enableIEEEBit(const MachineFunction &MF) const { + return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + } + bool useFlatForGlobal() const { return FlatForGlobal; } @@ -307,10 +364,22 @@ public: return UnalignedScratchAccess; } + bool hasApertureRegs() const { + return HasApertureRegs; + } + + bool isTrapHandlerEnabled() const { + return TrapHandler; + } + bool isXNACKEnabled() const { return EnableXNACK; } + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } @@ -324,6 +393,10 @@ public: return isAmdHsaOS() || isMesaKernel(MF); } + bool hasFminFmaxLegacy() const { + return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -355,72 +428,71 @@ public: return true; } + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return 4; + return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); } /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16; + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return getMaxWavesPerEU() * getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return getWavesPerWorkGroup(FlatWorkGroupSize); + return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum number of waves per execution unit supported by the /// subtarget. 
unsigned getMinWavesPerEU() const { - return 1; + return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 8; - // FIXME: Need to take scratch memory into account. - return 10; + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); } /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given flat work group size. + /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) / - getEUsPerCU(); + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), + FlatWorkGroupSize); } /// \returns Minimum flat work group size supported by the subtarget. unsigned getMinFlatWorkGroupSize() const { - return 1; + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); } /// \returns Maximum flat work group size supported by the subtarget. unsigned getMaxFlatWorkGroupSize() const { - return 2048; + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); } - /// \returns Number of waves per work group given the flat work group size. + /// \returns Number of waves per work group supported by the subtarget and + /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize(); + return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), + FlatWorkGroupSize); } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes /// for function \p F, or minimum/maximum flat work group sizes explicitly /// requested using "amdgpu-flat-work-group-size" attribute attached to @@ -440,6 +512,9 @@ public: /// compatible with minimum/maximum number of waves limited by flat work group /// size, register usage, and/or lds usage. std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; + + /// Creates value range metadata on an workitemid.* inrinsic call or load. + bool makeLIDRangeMetadata(Instruction *I) const; }; class R600Subtarget final : public AMDGPUSubtarget { @@ -482,13 +557,6 @@ public: }; class SISubtarget final : public AMDGPUSubtarget { -public: - enum { - // The closed Vulkan driver sets 96, which limits the wave count to 8 but - // doesn't spill SGPRs as much as when 80 is set. 
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 96 - }; - private: SIInstrInfo InstrInfo; SIFrameLowering FrameLowering; @@ -516,6 +584,21 @@ public: return GISel->getCallLowering(); } + const InstructionSelector *getInstructionSelector() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getInstructionSelector(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getLegalizerInfo(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); + } + const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } @@ -524,6 +607,11 @@ public: this->GISel.reset(&GISel); } + // XXX - Why is this here if it isn't in the default pass set? + bool enableEarlyIfConversion() const override { + return true; + } + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; @@ -533,10 +621,6 @@ public: return 16; } - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - bool hasSMemRealTime() const { return HasSMemRealTime; } @@ -549,6 +633,10 @@ public: return HasVGPRIndexMode; } + bool useVGPRIndexMode(bool UserEnable) const { + return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); + } + bool hasScalarCompareEq64() const { return getGeneration() >= VOLCANIC_ISLANDS; } @@ -561,6 +649,14 @@ public: return HasInv2PiInlineImm; } + bool hasSDWA() const { + return HasSDWA; + } + + bool hasDPP() const { + return HasDPP; + } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -594,6 +690,14 @@ public: return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } + bool hasSMovFedHazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + + bool hasReadM0Hazard() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs @@ -605,10 +709,107 @@ public: /// \returns True if waitcnt instruction is needed before barrier instruction, /// false otherwise. bool needWaitcntBeforeBarrier() const { - return true; + return getGeneration() < GFX9; + } + + /// \returns true if the flat_scratch register should be initialized with the + /// pointer to the wave's scratch memory rather than a size and offset. + bool flatScratchIsPointer() const { + return getGeneration() >= GFX9; + } + + /// \returns SGPR allocation granularity supported by the subtarget. + unsigned getSGPRAllocGranule() const { + return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); + } + + /// \returns SGPR encoding granularity supported by the subtarget. + unsigned getSGPREncodingGranule() const { + return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); } - unsigned getMaxNumSGPRs() const; + /// \returns Total number of SGPRs supported by the subtarget. + unsigned getTotalNumSGPRs() const { + return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); + } + + /// \returns Addressable number of SGPRs supported by the subtarget. + unsigned getAddressableNumSGPRs() const { + return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); + } + + /// \returns Minimum number of SGPRs that meets the given number of waves per + /// execution unit requirement supported by the subtarget. 
+ unsigned getMinNumSGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Maximum number of SGPRs that meets the given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { + return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, + Addressable); + } + + /// \returns Reserved number of SGPRs for given function \p MF. + unsigned getReservedNumSGPRs(const MachineFunction &MF) const; + + /// \returns Maximum number of SGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of SGPRs explicitly + /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. + unsigned getMaxNumSGPRs(const MachineFunction &MF) const; + + /// \returns VGPR allocation granularity supported by the subtarget. + unsigned getVGPRAllocGranule() const { + return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());; + } + + /// \returns VGPR encoding granularity supported by the subtarget. + unsigned getVGPREncodingGranule() const { + return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); + } + + /// \returns Total number of VGPRs supported by the subtarget. + unsigned getTotalNumVGPRs() const { + return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); + } + + /// \returns Addressable number of VGPRs supported by the subtarget. + unsigned getAddressableNumVGPRs() const { + return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); + } + + /// \returns Minimum number of VGPRs that meets given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMinNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Maximum number of VGPRs that meets given number of waves per + /// execution unit requirement supported by the subtarget. + unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { + return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); + } + + /// \returns Reserved number of VGPRs for given function \p MF. + unsigned getReservedNumVGPRs(const MachineFunction &MF) const { + return debuggerReserveRegs() ? 4 : 0; + } + + /// \returns Maximum number of VGPRs that meets number of waves per execution + /// unit requirement for function \p MF, or number of VGPRs explicitly + /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. + /// + /// \returns Value that meets number of waves per execution unit requirement + /// if explicitly requested value cannot be converted to integer, violates + /// subtarget's specifications, or does not meet number of waves per execution + /// unit requirement. 
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d8a0c716279c..0202220b8011 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -15,24 +15,29 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "AMDGPURegisterBankInfo.h" +#endif #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" +#include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" #include "SIMachineScheduler.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" @@ -58,6 +63,11 @@ static cl::opt<bool> EnableSROA( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> +EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); + static cl::opt<bool> EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -78,6 +88,36 @@ static cl::opt<bool> ScalarizeGlobal( cl::init(false), cl::Hidden); +// Option to run internalize pass. +static cl::opt<bool> InternalizeSymbols( + "amdgpu-internalize-symbols", + cl::desc("Enable elimination of non-kernel functions and unused globals"), + cl::init(false), + cl::Hidden); + +// Option to inline all early. +static cl::opt<bool> EarlyInlineAll( + "amdgpu-early-inline-all", + cl::desc("Inline all functions early"), + cl::init(false), + cl::Hidden); + +static cl::opt<bool> EnableSDWAPeephole( + "amdgpu-sdwa-peephole", + cl::desc("Enable SDWA peepholer"), + cl::init(true)); + +// Enable address space based alias analysis +static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); + +// Option to enable new waitcnt insertion pass. 
+static cl::opt<bool> EnableSIInsertWaitcntsPass( + "enable-si-insert-waitcnts", + cl::desc("Use new waitcnt insertion pass"), + cl::init(false)); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -86,22 +126,28 @@ extern "C" void LLVMInitializeAMDGPUTarget() { PassRegistry *PR = PassRegistry::getPassRegistry(); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); + initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); + initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitsPass(*PR); + initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeAMDGPUUnifyDivergentExitNodesPass(*PR); + initializeAMDGPUAAWrapperPassPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -119,13 +165,26 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, - llvm::make_unique<GCNMaxOccupancySchedStrategy>(C)); + new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } +static ScheduleDAGInstrs * +createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; +} + +static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { + return new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_MINREGFORCED); +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -139,6 +198,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler); +static MachineSchedRegistry +IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", + "Run GCN scheduler to maximize occupancy (experimental)", + createIterativeGCNMaxOccupancyMachineScheduler); + +static MachineSchedRegistry +GCNMinRegSchedRegistry("gcn-minreg", + "Run GCN iterative scheduler for minimal register usage (experimental)", + createMinRegScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. @@ -148,9 +217,14 @@ static StringRef computeDataLayout(const Triple &TT) { // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. 
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + if (TT.getEnvironmentName() == "amdgiz" || + TT.getEnvironmentName() == "amdgizcl") + return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } LLVM_READNONE @@ -180,6 +254,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), TLOF(createTLOF(getTargetTriple())) { + AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } @@ -199,8 +274,65 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { FSAttr.getValueAsString(); } -void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { - PM.add(createAMDGPUUnifyMetadataPass()); +static ImmutablePass *createAMDGPUExternalAAWrapperPass() { + return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + }); +} + +void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { + Builder.DivergentTarget = true; + + bool Internalize = InternalizeSymbols && + (getOptLevel() > CodeGenOpt::None) && + (getTargetTriple().getArch() == Triple::amdgcn); + bool EarlyInline = EarlyInlineAll && + (getOptLevel() > CodeGenOpt::None); + bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None; + + Builder.addExtension( + PassManagerBuilder::EP_ModuleOptimizerEarly, + [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + PM.add(createAMDGPUUnifyMetadataPass()); + if (Internalize) { + PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool { + if (const Function *F = dyn_cast<Function>(&GV)) { + if (F->isDeclaration()) + return true; + switch (F->getCallingConv()) { + default: + return false; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + return !GV.use_empty(); + })); + PM.add(createGlobalDCEPass()); + } + if (EarlyInline) + PM.add(createAMDGPUAlwaysInlinePass(false)); + }); + + Builder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + }); } //===----------------------------------------------------------------------===// @@ -245,9 +377,21 @@ namespace { struct SIGISelActualAccessor : public GISelAccessor { std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; const AMDGPUCallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { + return 
InstSelector.get(); + } + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } }; } // end anonymous namespace @@ -281,6 +425,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); GISel->CallLoweringInfo.reset( new AMDGPUCallLowering(*I->getTargetLowering())); + GISel->Legalizer.reset(new AMDGPULegalizerInfo()); + + GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo())); + GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I, + *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get()))); #endif I->setGISelAccessor(*GISel); @@ -356,9 +505,9 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; - void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; + bool addILPOpts() override; bool addInstSelector() override; #ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; @@ -406,11 +555,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { } void AMDGPUPassConfig::addIRPasses() { + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPULowerIntrinsicsPass(&TM)); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); @@ -421,17 +574,33 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + + addPass(createAMDGPUCodeGenPreparePass( + static_cast<const GCNTargetMachine *>(&TM))); + } + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); - const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); if (TM.getOptLevel() > CodeGenOpt::None) { + addPass(createInferAddressSpacesPass()); addPass(createAMDGPUPromoteAlloca(&TM)); if (EnableSROA) addPass(createSROAPass()); addStraightLineScalarOptimizationPasses(); + + if (EnableAMDGPUAliasAnalysis) { + addPass(createAMDGPUAAWrapperPass()); + addPass(createExternalAAWrapperPass([](Pass &P, Function &, + AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + })); + } } TargetPassConfig::addIRPasses(); @@ -526,7 +695,12 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(&AMDGPUAnnotateKernelFeaturesID); + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM)); + + // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit + // regions formed by them. 
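The adjustPassManager() and addIRPasses() hunks above feed two different consumers: the former is picked up by IR-level drivers built on PassManagerBuilder, the latter by the codegen pipeline itself. Roughly how a legacy-pass-manager driver picks up those extensions is sketched here; the driver-side code is an illustrative assumption, not part of the patch:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Target/TargetMachine.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"
    using namespace llvm;

    void buildAndRunIRPipeline(TargetMachine *TM, Module &M) {
      PassManagerBuilder Builder;
      Builder.OptLevel = 2;
      TM->adjustPassManager(Builder);         // installs the extensions registered above
      legacy::PassManager MPM;
      Builder.populateModulePassManager(MPM);
      MPM.run(M);
    }

The addPreISel() hunk continues below.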
+ addPass(&AMDGPUUnifyDivergentExitNodesID); addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); @@ -549,13 +723,19 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); + addPass(createSIShrinkInstructionsPass()); + if (EnableSDWAPeephole) { + addPass(&SIPeepholeSDWAID); + addPass(&DeadMachineInstructionElimID); + } } -void GCNPassConfig::addIRPasses() { - // TODO: May want to move later or split into an early and late one. - addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); +bool GCNPassConfig::addILPOpts() { + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); - AMDGPUPassConfig::addIRPasses(); + TargetPassConfig::addILPOpts(); + return false; } bool GCNPassConfig::addInstSelector() { @@ -572,20 +752,23 @@ bool GCNPassConfig::addIRTranslator() { } bool GCNPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); return false; } bool GCNPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); return false; } bool GCNPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); return false; } + #endif void GCNPassConfig::addPreRegAlloc() { - addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } @@ -615,6 +798,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { + addPass(&SIFixVGPRCopiesID); addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } @@ -633,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); - addPass(createSIInsertWaitsPass()); + if (EnableSIInsertWaitcntsPass) + addPass(createSIInsertWaitcntsPass()); + else + addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); addPass(createSIDebuggerInsertNopsPass()); @@ -643,3 +830,4 @@ void GCNPassConfig::addPreEmitPass() { TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(this, PM); } + diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 9496773a073f..934bf7f31bab 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUIntrinsicInfo IntrinsicInfo; + AMDGPUAS AS; StringRef getGPUName(const Function &F) const; StringRef getFeatureString(const Function &F) const; @@ -57,7 +58,18 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } - void addEarlyAsPossiblePasses(PassManagerBase &PM) override; + AMDGPUAS getAMDGPUAS() const { + return AS; + } + + void adjustPassManager(PassManagerBuilder &) override; + /// Get the integer value of a null pointer in the given address space. 
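As the implementation just below spells out, only the local and region address spaces use an all-ones null pointer; every other address space keeps 0. A small fragment for illustration, assuming a constructed AMDGPUTargetMachine and this header included:

    #include "AMDGPUTargetMachine.h"
    #include <cstdint>
    using namespace llvm;

    void showNullValues(const AMDGPUTargetMachine &TM) {
      AMDGPUAS AS = TM.getAMDGPUAS();
      uint64_t FlatNull = TM.getNullPointerValue(AS.FLAT_ADDRESS);   // 0
      uint64_t LDSNull  = TM.getNullPointerValue(AS.LOCAL_ADDRESS);  // all ones, i.e. (uint64_t)-1
      (void)FlatNull; (void)LDSNull;
    }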
+ uint64_t getNullPointerValue(unsigned AddrSpace) const { + if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) + return -1; + return 0; + } + }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index 1fddc88a705a..c96761c0b04e 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPUTargetMachine.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "llvm/MC/MCContext.h" @@ -22,7 +23,8 @@ using namespace llvm; MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) && + auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) && AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple())) return TextSection; diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index de327786dff6..ca6210f69298 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -16,6 +16,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#include "AMDGPU.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index e90487065992..01ac9968181a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -29,6 +29,39 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" +static cl::opt<unsigned> UnrollThresholdPrivate( + "amdgpu-unroll-threshold-private", + cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), + cl::init(2500), cl::Hidden); + +static cl::opt<unsigned> UnrollThresholdLocal( + "amdgpu-unroll-threshold-local", + cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), + cl::init(1000), cl::Hidden); + +static cl::opt<unsigned> UnrollThresholdIf( + "amdgpu-unroll-threshold-if", + cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), + cl::init(150), cl::Hidden); + +static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, + unsigned Depth = 0) { + const Instruction *I = dyn_cast<Instruction>(Cond); + if (!I) + return false; + + for (const Value *V : I->operand_values()) { + if (!L->contains(I)) + continue; + if (const PHINode *PHI = dyn_cast<PHINode>(V)) { + if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { + return SubLoop->contains(PHI); })) + return true; + } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1)) + return true; + } + return false; +} void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { @@ -38,29 +71,115 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, // TODO: Do we want runtime unrolling? + // Maximum alloca size than can fit registers. Reserve 16 registers. 
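To make the constant that follows concrete: each VGPR holds 32 bits (4 bytes) and a lane can use at most 256 VGPRs, of which the heuristic keeps 16 in reserve, so MaxAlloca = (256 - 16) * 4 = 960 bytes. Static allocas larger than that never receive the boosted private-memory unroll threshold.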
+ const unsigned MaxAlloca = (256 - 16) * 4; + unsigned ThresholdPrivate = UnrollThresholdPrivate; + unsigned ThresholdLocal = UnrollThresholdLocal; + unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); + AMDGPUAS ASST = ST->getAMDGPUAS(); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); + unsigned LocalGEPsSeen = 0; + + if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { + return SubLoop->contains(BB); })) + continue; // Block belongs to an inner loop. + for (const Instruction &I : *BB) { + + // Unroll a loop which contains an "if" statement whose condition + // defined by a PHI belonging to the loop. This may help to eliminate + // if region and potentially even PHI itself, saving on both divergence + // and registers used for the PHI. + // Add a small bonus for each of such "if" statements. + if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) { + if (UP.Threshold < MaxBoost && Br->isConditional()) { + if (L->isLoopExiting(Br->getSuccessor(0)) || + L->isLoopExiting(Br->getSuccessor(1))) + continue; + if (dependsOnLocalPhi(L, Br->getCondition())) { + UP.Threshold += UnrollThresholdIf; + DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold + << " for loop:\n" << *L << " due to " << *Br << '\n'); + if (UP.Threshold >= MaxBoost) + return; + } + } + continue; + } + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); - if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + if (!GEP) + continue; + + unsigned AS = GEP->getAddressSpace(); + unsigned Threshold = 0; + if (AS == ASST.PRIVATE_ADDRESS) + Threshold = ThresholdPrivate; + else if (AS == ASST.LOCAL_ADDRESS) + Threshold = ThresholdLocal; + else + continue; + + if (UP.Threshold >= Threshold) continue; - const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = - dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); - if (Alloca) { - // We want to do whatever we can to limit the number of alloca - // instructions that make it through to the code generator. allocas - // require us to use indirect addressing, which is slow and prone to - // compiler bugs. If this loop does an address calculation on an - // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. - UP.Threshold = 800; + if (AS == ASST.PRIVATE_ADDRESS) { + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); + if (!Alloca || !Alloca->isStaticAlloca()) + continue; + Type *Ty = Alloca->getAllocatedType(); + unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; + if (AllocaSize > MaxAlloca) + continue; + } else if (AS == ASST.LOCAL_ADDRESS) { + LocalGEPsSeen++; + // Inhibit unroll for local memory if we have seen addressing not to + // a variable, most likely we will be unable to combine it. + // Do not unroll too deep inner loops for local memory to give a chance + // to unroll an outer loop for a more important reason. + if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 || + (!isa<GlobalVariable>(GEP->getPointerOperand()) && + !isa<Argument>(GEP->getPointerOperand()))) + continue; } + + // Check if GEP depends on a value defined by this loop itself. 
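For intuition about the private-memory case handled above and the dependence check that follows, this is the shape of loop the heuristic targets; a hypothetical source-level example, not taken from the patch:

    // A small per-thread scratch array indexed by the induction variable.  After
    // full unrolling every GEP index becomes a constant, so SROA can replace the
    // alloca with plain registers and the scratch traffic disappears.
    float sumOfSquares(const float *in, float scale) {
      float tmp[8];                   // lowered to a private (scratch) alloca
      for (int i = 0; i < 8; ++i)     // GEP into tmp depends on the loop value i
        tmp[i] = in[i] * scale;
      float acc = 0.0f;
      for (int i = 0; i < 8; ++i)
        acc += tmp[i] * tmp[i];
      return acc;
    }

The GEP-dependence test itself continues below.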
+ bool HasLoopDef = false; + for (const Value *Op : GEP->operands()) { + const Instruction *Inst = dyn_cast<Instruction>(Op); + if (!Inst || L->isLoopInvariant(Op)) + continue; + + if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { + return SubLoop->contains(Inst); })) + continue; + HasLoopDef = true; + break; + } + if (!HasLoopDef) + continue; + + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // We also want to have more unrolling for local memory to let ds + // instructions with different offsets combine. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = Threshold; + DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n" + << *L << " due to " << *GEP << '\n'); + if (UP.Threshold >= MaxBoost) + return; } } } @@ -81,28 +200,56 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - switch (AddrSpace) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::FLAT_ADDRESS: + AMDGPUAS AS = ST->getAMDGPUAS(); + if (AddrSpace == AS.GLOBAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS || + AddrSpace == AS.FLAT_ADDRESS) return 128; - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: + if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.REGION_ADDRESS) return 64; - case AMDGPUAS::PRIVATE_ADDRESS: + if (AddrSpace == AS.PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - default: - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && - (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || - AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || - (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && - AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) - return 128; - llvm_unreachable("unhandled address space"); + + if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && + (AddrSpace == AS.PARAM_D_ADDRESS || + AddrSpace == AS.PARAM_I_ADDRESS || + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); +} + +bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + // We allow vectorization of flat stores, even though we may need to decompose + // them later if they may access private memory. We don't have enough context + // here, and legalization can handle it. 
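These TTI hooks are what the load/store vectorizer consults when deciding how wide a combined access may be. A small sketch of the limits they report, assuming TTI is the TargetTransformInfo computed for an amdgcn function and AS its AMDGPUAS layout:

    #include "AMDGPU.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    using namespace llvm;

    void showVectorizeLimits(const TargetTransformInfo &TTI, const AMDGPUAS &AS) {
      unsigned GlobalBits = TTI.getLoadStoreVecRegBitWidth(AS.GLOBAL_ADDRESS); // 128
      unsigned LDSBits    = TTI.getLoadStoreVecRegBitWidth(AS.LOCAL_ADDRESS);  // 64
      // At most a 4 x 32-bit access to global/constant/flat memory and a
      // 2 x 32-bit access to LDS; private is capped by the subtarget's
      // maximum scratch element size.
      (void)GlobalBits; (void)LDSBits;
    }

The private-address-space legality check continues below.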
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) { + return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } + return true; +} + +bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Disable unrolling if the loop is not vectorized. + if (VF == 1) + return 1; + // Semi-arbitrary large amount. return 64; } @@ -228,16 +375,8 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } } -static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, - const IntrinsicInst *I) { +static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { switch (I->getIntrinsicID()) { - default: - return false; - case Intrinsic::not_intrinsic: - // This means we have an intrinsic that isn't defined in - // IntrinsicsAMDGPU.td - break; - case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::amdgcn_workitem_id_z: @@ -249,6 +388,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::r600_read_tidig_x: case Intrinsic::r600_read_tidig_y: case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_image_atomic_swap: case Intrinsic::amdgcn_image_atomic_add: case Intrinsic::amdgcn_image_atomic_sub: @@ -274,16 +415,10 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_cmpswap: case Intrinsic::amdgcn_ps_live: + case Intrinsic::amdgcn_ds_swizzle: return true; - } - - StringRef Name = I->getCalledFunction()->getName(); - switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { default: return false; - case AMDGPUIntrinsic::SI_fs_interp: - case AMDGPUIntrinsic::SI_fs_constant: - return true; } } @@ -295,8 +430,8 @@ static bool isArgPassedInSGPR(const Argument *A) { return true; // For non-compute shaders, SGPR inputs are marked with either inreg or byval. - if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || - F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + if (F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || + F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal)) return true; // Everything else is in VGPRs. @@ -318,7 +453,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { // All other loads are not divergent, because if threads issue loads with the // same arguments, they will always get the same result. 
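A side note on the hasAttribute -> hasParamAttribute change above: the legacy interface indexed attribute sets with slot 0 reserved for the return value, so parameter N lived at index N + 1, while the new accessor takes the plain argument number. Both spellings appear in the hunk; side by side, with F and ArgNo assumed:

    // F is a const Function*, ArgNo the zero-based parameter number.
    bool OldWay = F->getAttributes().hasAttribute(ArgNo + 1, Attribute::InReg);
    bool NewWay = F->getAttributes().hasParamAttribute(ArgNo, Attribute::InReg);

The divergence handling for loads continues below.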
if (const LoadInst *Load = dyn_cast<LoadInst>(V)) - return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS; // Atomics are divergent because they are executed sequentially: when an // atomic operation refers to the same address in each thread, then each @@ -327,10 +462,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V)) return true; - if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - const TargetMachine &TM = getTLI()->getTargetMachine(); - return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); - } + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) + return isIntrinsicSourceOfDivergence(Intrinsic); // Assume all function calls are a source of divergence. if (isa<CallInst>(V) || isa<InvokeInst>(V)) diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 0d83b2a585bf..71d6306bc1a5 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -32,6 +32,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { const AMDGPUSubtarget *ST; const AMDGPUTargetLowering *TLI; + bool IsGraphicsShader; const AMDGPUSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } @@ -62,7 +63,8 @@ public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), - TLI(ST->getTargetLowering()) {} + TLI(ST->getTargetLowering()), + IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} bool hasBranchDivergence() { return true; } @@ -76,6 +78,17 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; + + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( @@ -91,6 +104,15 @@ public: int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; + unsigned getFlatAddressSpace() const { + // Don't bother running InferAddressSpaces pass on graphics shaders which + // don't use flat addressing. + if (IsGraphicsShader) + return -1; + return ST->hasFlatAddressSpace() ? + ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE; + } + unsigned getVectorSplitCost() { return 0; } }; diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp new file mode 100644 index 000000000000..309913f87fb6 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -0,0 +1,225 @@ +//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a variant of the UnifyDivergentExitNodes pass. 
Rather than ensuring +// there is at most one ret and one unreachable instruction, it ensures there is +// at most one divergent exiting block. +// +// StructurizeCFG can't deal with multi-exit regions formed by branches to +// multiple return nodes. It is not desirable to structurize regions with +// uniform branches, so unifying those to the same return block as divergent +// branches inhibits use of scalar branching. It still can't deal with the case +// where one branch goes to return, and one unreachable. Replace unreachable in +// this case with a return. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes" + +namespace { + +class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { + initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry()); + } + + // We can preserve non-critical-edgeness when we unify function exit nodes + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} + +char AMDGPUUnifyDivergentExitNodes::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, + "Unify divergent function exit nodes", false, false) + +char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; + +void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ + // TODO: Preserve dominator tree. + AU.addRequired<PostDominatorTreeWrapperPass>(); + + AU.addRequired<DivergenceAnalysis>(); + + // No divergent values are changed, only blocks and branch edges. + AU.addPreserved<DivergenceAnalysis>(); + + // We preserve the non-critical-edgeness property + AU.addPreservedID(BreakCriticalEdgesID); + + // This is a cluster of orthogonal Transforms + AU.addPreservedID(LowerSwitchID); + FunctionPass::getAnalysisUsage(AU); + + AU.addRequired<TargetTransformInfoWrapperPass>(); +} + +/// \returns true if \p BB is reachable through only uniform branches. +/// XXX - Is there a more efficient way to find this? 
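Before the helper that implements this reachability check (it follows below), a concrete picture of the problem the pass solves; a hypothetical kernel-style function, not taken from the patch:

    // tid stands for the per-lane workitem id, so the branch is divergent: some
    // lanes take the early return while others fall through to the store.  That
    // leaves two exiting blocks reached through divergent branches, which this
    // pass funnels into a single UnifiedReturnBlock before StructurizeCFG runs.
    void kernelBody(float *out, const float *in, int tid) {
      if (in[tid] < 0.0f)
        return;                    // divergent exit #1
      out[tid] = in[tid] * 2.0f;
    }                              // divergent exit #2: the fall-through return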
+static bool isUniformlyReached(const DivergenceAnalysis &DA, + BasicBlock &BB) { + SmallVector<BasicBlock *, 8> Stack; + SmallPtrSet<BasicBlock *, 8> Visited; + + for (BasicBlock *Pred : predecessors(&BB)) + Stack.push_back(Pred); + + while (!Stack.empty()) { + BasicBlock *Top = Stack.pop_back_val(); + if (!DA.isUniform(Top->getTerminator())) + return false; + + for (BasicBlock *Pred : predecessors(Top)) { + if (Visited.insert(Pred).second) + Stack.push_back(Pred); + } + } + + return true; +} + +static BasicBlock *unifyReturnBlockSet(Function &F, + ArrayRef<BasicBlock *> ReturningBlocks, + const TargetTransformInfo &TTI, + StringRef Name) { + // Otherwise, we need to insert a new basic block into the function, add a PHI + // nodes (if the function returns values), and convert all of the return + // instructions into unconditional branches. + // + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); + + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... + PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + + for (BasicBlock *BB : ReturningBlocks) { + // Cleanup possible branch to unconditional branch to the return. + SimplifyCFG(BB, TTI, 2); + } + + return NewRetBlock; +} + +bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + if (PDT.getRoots().size() <= 1) + return false; + + DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>(); + + // Loop over all of the blocks in a function, tracking all of the blocks that + // return. + // + SmallVector<BasicBlock *, 4> ReturningBlocks; + SmallVector<BasicBlock *, 4> UnreachableBlocks; + + for (BasicBlock *BB : PDT.getRoots()) { + if (isa<ReturnInst>(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + ReturningBlocks.push_back(BB); + } else if (isa<UnreachableInst>(BB->getTerminator())) { + if (!isUniformlyReached(DA, *BB)) + UnreachableBlocks.push_back(BB); + } + } + + if (!UnreachableBlocks.empty()) { + BasicBlock *UnreachableBlock = nullptr; + + if (UnreachableBlocks.size() == 1) { + UnreachableBlock = UnreachableBlocks.front(); + } else { + UnreachableBlock = BasicBlock::Create(F.getContext(), + "UnifiedUnreachableBlock", &F); + new UnreachableInst(F.getContext(), UnreachableBlock); + + for (BasicBlock *BB : UnreachableBlocks) { + BB->getInstList().pop_back(); // Remove the unreachable inst. + BranchInst::Create(UnreachableBlock, BB); + } + } + + if (!ReturningBlocks.empty()) { + // Don't create a new unreachable inst if we have a return. The + // structurizer/annotator can't handle the multiple exits + + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst. 
+ + Function *UnreachableIntrin = + Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + + // Insert a call to an intrinsic tracking that this is an unreachable + // point, in case we want to kill the active lanes or something later. + CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock); + + // Don't create a scalar trap. We would only want to trap if this code was + // really reached, but a scalar trap would happen even if no lanes + // actually reached here. + ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock); + ReturningBlocks.push_back(UnreachableBlock); + } + } + + // Now handle return blocks. + if (ReturningBlocks.empty()) + return false; // No blocks return + + if (ReturningBlocks.size() == 1) + return false; // Already has a single return block + + const TargetTransformInfo &TTI + = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock"); + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index bf501a1e8405..3a0c3ede08f4 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -13,38 +13,39 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include <algorithm> +#include <cassert> using namespace llvm; namespace { + namespace kOCLMD { + const char SpirVer[] = "opencl.spir.version"; const char OCLVer[] = "opencl.ocl.version"; const char UsedExt[] = "opencl.used.extensions"; const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; const char CompilerOptions[] = "opencl.compiler.options"; const char LLVMIdent[] = "llvm.ident"; - } + + } // end namespace kOCLMD /// \brief Unify multiple OpenCL metadata due to linking. - class AMDGPUUnifyMetadata : public FunctionPass { + class AMDGPUUnifyMetadata : public ModulePass { public: static char ID; - explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {}; + explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}; private: - // This should really be a module pass but we have to run it as early - // as possible, so given function passes are executed first and - // TargetMachine::addEarlyAsPossiblePasses() expects only function passes - // it has to be a function pass. virtual bool runOnModule(Module &M); - // \todo: Convert to a module pass. - virtual bool runOnFunction(Function &F); - /// \brief Unify version metadata. /// \return true if changes are made. 
/// Assume the named metadata has operands each of which is a pair of @@ -117,7 +118,7 @@ INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata", "Unify multiple OpenCL metadata due to linking", false, false) -FunctionPass* llvm::createAMDGPUUnifyMetadataPass() { +ModulePass* llvm::createAMDGPUUnifyMetadataPass() { return new AMDGPUUnifyMetadata(); } @@ -143,7 +144,3 @@ bool AMDGPUUnifyMetadata::runOnModule(Module &M) { return Changed; } - -bool AMDGPUUnifyMetadata::runOnFunction(Function &F) { - return runOnModule(*F.getParent()); -} diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 7faeccdc5df3..1a393845a822 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -9,27 +9,40 @@ //==-----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "R600RegisterInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Dominators.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstddef> #include <deque> +#include <iterator> +#include <map> +#include <utility> +#include <vector> using namespace llvm; @@ -53,15 +66,19 @@ STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); -} + +} // end namespace llvm + +namespace { //===----------------------------------------------------------------------===// // // Miscellaneous utility for CFGStructurizer. 
// //===----------------------------------------------------------------------===// -namespace { + #define SHOWNEWINSTR(i) \ DEBUG(dbgs() << "New instr: " << *i << "\n"); @@ -82,35 +99,19 @@ DEBUG( \ #define INVALIDSCCNUM -1 -template<class NodeT> -void ReverseVector(SmallVectorImpl<NodeT *> &Src) { - size_t sz = Src.size(); - for (size_t i = 0; i < sz/2; ++i) { - NodeT *t = Src[i]; - Src[i] = Src[sz - i - 1]; - Src[sz - i - 1] = t; - } -} - -} // end anonymous namespace - //===----------------------------------------------------------------------===// // // supporting data structure for CFGStructurizer // //===----------------------------------------------------------------------===// - -namespace { - class BlockInformation { public: - bool IsRetired; - int SccNum; - BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} -}; + bool IsRetired = false; + int SccNum = INVALIDSCCNUM; -} // end anonymous namespace + BlockInformation() = default; +}; //===----------------------------------------------------------------------===// // @@ -118,7 +119,6 @@ public: // //===----------------------------------------------------------------------===// -namespace { class AMDGPUCFGStructurizer : public MachineFunctionPass { public: typedef SmallVector<MachineBasicBlock *, 32> MBBVector; @@ -133,8 +133,7 @@ public: static char ID; - AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + AMDGPUCFGStructurizer() : MachineFunctionPass(ID) { initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } @@ -167,7 +166,7 @@ public: MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + DEBUG(MDT->print(dbgs(), (const Module*)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); DEBUG(PDT->print(dbgs());); prepare(); @@ -180,8 +179,8 @@ protected: MachineDominatorTree *MDT; MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; + const R600InstrInfo *TII = nullptr; + const R600RegisterInfo *TRI = nullptr; // PRINT FUNCTIONS /// Print the ordered Blocks. 
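Most of the AMDILCFGStructurizer changes in this stretch are mechanical modernizations rather than functional ones. The recurring pattern, in miniature (an illustrative type, not from the file): members gain in-class default initializers and the hand-written constructor becomes "= default", as BlockInformation and the TII/TRI members above show.

    struct Widget {
      int *Ptr = nullptr;   // was: Widget() : Ptr(nullptr), Count(-1) {}
      int Count = -1;
      Widget() = default;
    };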
@@ -198,6 +197,7 @@ protected: } } } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { for (MachineLoop::iterator iter = LoopInfo.begin(), iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { @@ -263,7 +263,6 @@ protected: MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); static void wrapup(MachineBasicBlock *MBB); - int patternMatch(MachineBasicBlock *MBB); int patternMatchGroup(MachineBasicBlock *MBB); int serialPatternMatch(MachineBasicBlock *MBB); @@ -328,7 +327,6 @@ protected: void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - private: MBBInfoMap BlockInfoMap; LoopLandInfoMap LLInfoMap; @@ -337,6 +335,10 @@ private: SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks; }; +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) @@ -379,6 +381,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { } return false; } + AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, bool AllowSideEntry) const { @@ -697,10 +700,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but // there isn't such an interface yet. alternatively, replace all the other // blocks in the jump table with the entryBlk //} - } - bool AMDGPUCFGStructurizer::prepare() { bool Changed = false; @@ -748,7 +749,6 @@ bool AMDGPUCFGStructurizer::prepare() { } bool AMDGPUCFGStructurizer::run() { - //Assume reducible CFG... DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); @@ -886,8 +886,6 @@ bool AMDGPUCFGStructurizer::run() { return true; } - - void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; MachineBasicBlock *MBB; @@ -903,11 +901,8 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { } } - //walk through all the block in func to check for unreachable - typedef GraphTraits<MachineFunction *> GTM; - auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; + // walk through all the block in func to check for unreachable + for (auto *MBB : nodes(MF)) { SccNum = getSCCNum(MBB); if (SccNum == INVALIDSCCNUM) dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; @@ -941,7 +936,6 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { return NumMatch; } - int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { if (MBB->succ_size() != 1) return 0; @@ -1039,7 +1033,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { for (MachineLoop *ML : depth_first(It)) NestedLoops.push_front(ML); - if (NestedLoops.size() == 0) + if (NestedLoops.empty()) return 0; // Process nested loop outside->inside (we did push_front), @@ -1074,13 +1068,9 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MachineBasicBlock *ExitBlk = *ExitBlks.begin(); assert(ExitBlk && "Loop has several exit block"); MBBVector LatchBlks; - typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits; - InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), - PE = InvMBBTraits::child_end(LoopHeader); - for (; PI != PE; PI++) { - if (LoopRep->contains(*PI)) - LatchBlks.push_back(*PI); - } + for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader)) + if 
(LoopRep->contains(LB)) + LatchBlks.push_back(LB); for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); @@ -1217,7 +1207,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( } } - dbgs() << "\n"; + dbgs() << "\n"; } int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, @@ -1478,7 +1468,6 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (LandMBB && TrueMBB && FalseMBB) MBB->addSuccessor(LandMBB); - } void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, @@ -1491,7 +1480,6 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, DstBlk->replaceSuccessor(DstBlk, LandMBB); } - void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() @@ -1727,11 +1715,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { && "can't retire block yet"); } -char AMDGPUCFGStructurizer::ID = 0; - -} // end anonymous namespace - - INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3cf9a1d92469..961f7186f373 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -16,6 +16,7 @@ #include "Utils/AMDGPUAsmUtils.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -39,15 +40,12 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/MathExtras.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -56,7 +54,6 @@ #include <map> #include <memory> #include <string> -#include <vector> using namespace llvm; using namespace llvm::AMDGPU; @@ -83,7 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { const AMDGPUAsmParser *AsmParser; public: - AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_) + AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_) : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} typedef std::unique_ptr<AMDGPUOperand> Ptr; @@ -160,7 +157,11 @@ public: ImmTySendMsg, ImmTyInterpSlot, ImmTyInterpAttr, - ImmTyAttrChan + ImmTyAttrChan, + ImmTyOpSel, + ImmTyOpSelHi, + ImmTyNegLo, + ImmTyNegHi }; struct TokOp { @@ -297,6 +298,10 @@ public: bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); } bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); } bool isAttrChan() const { return isImmTy(ImmTyAttrChan); } + bool isOpSel() const { return isImmTy(ImmTyOpSel); } + bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } + bool isNegLo() const { return isImmTy(ImmTyNegLo); } + bool isNegHi() const { return isImmTy(ImmTyNegHi); } bool isMod() const { return isClampSI() || isOModSI(); @@ -316,6 +321,10 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } + bool 
isSCSrcV2B16() const { + return isSCSrcB16(); + } + bool isSCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } @@ -328,6 +337,10 @@ public: return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } + bool isSCSrcV2F16() const { + return isSCSrcF16(); + } + bool isSCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } @@ -344,6 +357,11 @@ public: return isSCSrcB16() || isLiteralImm(MVT::i16); } + bool isSSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isSSrcB16(); + } + bool isSSrcB64() const { // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. // See isVSrc64(). @@ -362,6 +380,11 @@ public: return isSCSrcB16() || isLiteralImm(MVT::f16); } + bool isSSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isSSrcF16(); + } + bool isVCSrcB32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -374,6 +397,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } + bool isVCSrcV2B16() const { + return isVCSrcB16(); + } + bool isVCSrcF32() const { return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } @@ -386,6 +413,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } + bool isVCSrcV2F16() const { + return isVCSrcF16(); + } + bool isVSrcB32() const { return isVCSrcF32() || isLiteralImm(MVT::i32); } @@ -398,6 +429,11 @@ public: return isVCSrcF16() || isLiteralImm(MVT::i16); } + bool isVSrcV2B16() const { + llvm_unreachable("cannot happen"); + return isVSrcB16(); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32); } @@ -410,6 +446,11 @@ public: return isVCSrcF16() || isLiteralImm(MVT::f16); } + bool isVSrcV2F16() const { + llvm_unreachable("cannot happen"); + return isVSrcF16(); + } + bool isKImmFP32() const { return isLiteralImm(MVT::f32); } @@ -459,7 +500,7 @@ public: return Imm.Val; } - enum ImmTy getImmTy() const { + ImmTy getImmTy() const { assert(isImm()); return Imm.Type; } @@ -501,9 +542,11 @@ public: return getModifiers().hasIntModifiers(); } + uint64_t applyInputFPModifiers(uint64_t Val, unsigned Size) const; + void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const; - void addLiteralImmOperand(MCInst &Inst, int64_t Val) const; + void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const; template <unsigned Bitwidth> void addKImmFPOperands(MCInst &Inst, unsigned N) const; @@ -610,6 +653,10 @@ public: case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; case ImmTyAttrChan: OS << "AttrChan"; break; + case ImmTyOpSel: OS << "OpSel"; break; + case ImmTyOpSelHi: OS << "OpSelHi"; break; + case ImmTyNegLo: OS << "NegLo"; break; + case ImmTyNegHi: OS << "NegHi"; break; } } @@ -636,7 +683,7 @@ public: static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser, int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, + ImmTy Type = ImmTyNone, bool IsFPImm = false) { auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser); Op->Imm.Val = Val; @@ -695,9 +742,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { // Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next // .amdgpu_hsa_kernel or at EOF. 
class KernelScopeInfo { - int SgprIndexUnusedMin; - int VgprIndexUnusedMin; - MCContext *Ctx; + int SgprIndexUnusedMin = -1; + int VgprIndexUnusedMin = -1; + MCContext *Ctx = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -708,6 +755,7 @@ class KernelScopeInfo { } } } + void usesVgprAt(int i) { if (i >= VgprIndexUnusedMin) { VgprIndexUnusedMin = ++i; @@ -717,14 +765,16 @@ class KernelScopeInfo { } } } + public: - KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr) - {} + KernelScopeInfo() = default; + void initialize(MCContext &Context) { Ctx = &Context; usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); } + void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; @@ -738,9 +788,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; MCAsmParser &Parser; - unsigned ForcedEncodingSize; - bool ForcedDPP; - bool ForcedSDWA; + unsigned ForcedEncodingSize = 0; + bool ForcedDPP = false; + bool ForcedSDWA = false; KernelScopeInfo KernelScope; /// @name Auto-generated Match Functions @@ -756,7 +806,7 @@ private: bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); - bool ParseDirectiveRuntimeMetadata(); + bool ParseDirectiveCodeObjectMetadata(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); bool ParseSectionDirectiveHSAText(); @@ -767,44 +817,52 @@ private: bool ParseSectionDirectiveHSADataGlobalAgent(); bool ParseSectionDirectiveHSADataGlobalProgram(); bool ParseSectionDirectiveHSARodataReadonlyAgent(); - bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); - bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); - void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn); + bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, + RegisterKind RegKind, unsigned Reg1, + unsigned RegNum); + bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, + unsigned& RegNum, unsigned& RegWidth, + unsigned *DwordRegIndex); + void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic, bool IsAtomicReturn); + void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, + bool IsGdsHardcoded); public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; + typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0), - ForcedDPP(false), - ForcedSDWA(false) { + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) { MCAsmParserExtension::Initialize(Parser); - if (getSTI().getFeatureBits().none()) { + if (getFeatureBits().none()) { // Set default features. copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits())); { // TODO: make those pre-defined variables read-only. // Currently there is none suitable machinery in the core llvm-mc for this. 
// MCSymbol::isRedefinable is intended for another purpose, and // AsmParser::parseDirectiveSet() cannot be specialized for specific target. - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); MCContext &Ctx = getContext(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx)); + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } KernelScope.initialize(getContext()); } @@ -822,7 +880,7 @@ public: } bool hasInv2PiInlineImm() const { - return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; + return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } bool hasSGPR102_SGPR103() const { @@ -844,6 +902,10 @@ public: return &MII; } + const FeatureBitset &getFeatureBits() const { + return getSTI().getFeatureBits(); + } + void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; } void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; } void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; } @@ -871,19 +933,28 @@ public: //bool ProcessInstruction(MCInst &Inst); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, bool (*ConvertResult)(int64_t &) = nullptr); + + OperandMatchResultTy parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = nullptr); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); - OperandMatchResultTy parseImm(OperandVector &Operands); + bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false); + OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false); OperandMatchResultTy parseReg(OperandVector &Operands); - OperandMatchResultTy parseRegOrImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false); OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); @@ -891,7 +962,8 @@ public: OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } + void cvtDSGds(MCInst &Inst, const OperandVector 
&Operands) { cvtDSImpl(Inst, Operands, true); } void cvtExp(MCInst &Inst, const OperandVector &Operands); bool parseCnt(int64_t &IntVal); @@ -911,6 +983,12 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); + bool validateOperandLimitations(const MCInst &Inst); + bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); + bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; + unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool isSGPR(unsigned Reg); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -940,7 +1018,13 @@ public: void cvtId(MCInst &Inst, const OperandVector &Operands); void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); + + void cvtVOP3Impl(MCInst &Inst, + const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); void cvtMIMG(MCInst &Inst, const OperandVector &Operands); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); @@ -988,6 +1072,30 @@ static const fltSemantics *getFltSemantics(MVT VT) { return getFltSemantics(VT.getSizeInBits() / 8); } +static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return &APFloat::IEEEsingle(); + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return &APFloat::IEEEdouble(); + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + return &APFloat::IEEEhalf(); + default: + llvm_unreachable("unsupported fp type"); + } +} + //===----------------------------------------------------------------------===// // Operand //===----------------------------------------------------------------------===// @@ -1031,13 +1139,18 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { if (!canLosslesslyConvertToFPType(FPLiteral, type)) return false; + if (type.getScalarSizeInBits() == 16) { + return AMDGPU::isInlinableLiteral16( + static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()), + AsmParser->hasInv2PiInlineImm()); + } + // Check if single precision literal is inlinable return AMDGPU::isInlinableLiteral32( static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()), AsmParser->hasInv2PiInlineImm()); } - // We got int literal token. if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand return AMDGPU::isInlinableLiteral64(Imm.Val, @@ -1064,6 +1177,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { if (!Imm.IsFPImm) { // We got int literal token. + if (type == MVT::f64 && hasFPModifiers()) { + // Cannot apply fp modifiers to int literals preserving the same semantics + // for VOP1/2/C and VOP3 because of integer truncation. To avoid ambiguity, + // disable these cases. 
+ return false; + } + unsigned Size = type.getSizeInBits(); if (Size == 64) Size = 32; @@ -1093,40 +1213,57 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } -void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { - int64_t Val = Imm.Val; - if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) { - // Apply modifiers to immediate value. Only negate can get here - if (Imm.IsFPImm) { - APFloat F(BitsToDouble(Val)); - F.changeSign(); - Val = F.bitcastToAPInt().getZExtValue(); - } else { - Val = -Val; - } +uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const +{ + assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); + assert(Size == 2 || Size == 4 || Size == 8); + + const uint64_t FpSignMask = (1ULL << (Size * 8 - 1)); + + if (Imm.Mods.Abs) { + Val &= ~FpSignMask; } + if (Imm.Mods.Neg) { + Val ^= FpSignMask; + } + + return Val; +} + +void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) { - addLiteralImmOperand(Inst, Val); + addLiteralImmOperand(Inst, Imm.Val, + ApplyModifiers & + isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); } else { - Inst.addOperand(MCOperand::createImm(Val)); + assert(!isImmTy(ImmTyNone) || !hasModifiers()); + Inst.addOperand(MCOperand::createImm(Imm.Val)); } } -void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { +void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const { const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode()); auto OpNum = Inst.getNumOperands(); // Check that this operand accepts literals assert(AMDGPU::isSISrcOperand(InstDesc, OpNum)); - auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size + if (ApplyModifiers) { + assert(AMDGPU::isSISrcFPOperand(InstDesc, OpNum)); + const unsigned Size = Imm.IsFPImm ? 
sizeof(double) : getOperandSize(InstDesc, OpNum); + Val = applyInputFPModifiers(Val, Size); + } + + APInt Literal(64, Val); + uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType; if (Imm.IsFPImm) { // We got fp literal token - APInt Literal(64, Val); - - switch (OpSize) { - case 8: { + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1151,16 +1288,31 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { // in predicate methods (isLiteralImm()) llvm_unreachable("fp literal in 64-bit integer instruction."); } - case 4: - case 2: { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision - FPLiteral.convert(*getFltSemantics(OpSize), + FPLiteral.convert(*getOpFltSemantics(OpTy), APFloat::rmNearestTiesToEven, &lost); // We allow precision lost but not overflow or underflow. This should be // checked earlier in isLiteralImm() - Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); + + uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue(); + if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || + OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + ImmVal |= (ImmVal << 16); + } + + Inst.addOperand(MCOperand::createImm(ImmVal)); return; } default: @@ -1173,8 +1325,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { // We got int literal token. // Only sign extend inline immediates. 
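// A small worked example for the packed V2INT16/V2FP16 path above, assuming the
// fp literal 1.0 on a v2f16 operand: the value is converted to half precision
// (0x3C00) and then replicated into both 16-bit halves of the encoding:
//   uint64_t ImmVal = 0x3C00;
//   ImmVal |= (ImmVal << 16);   // 0x3C003C00, the same constant in both lanes
// The integer-literal path below performs the equivalent replication.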
// FIXME: No errors on truncation - switch (OpSize) { - case 4: { + switch (OpTy) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: { if (isInt<32>(Val) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1185,9 +1340,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); return; } - case 8: { - if (AMDGPU::isInlinableLiteral64(Val, - AsmParser->hasInv2PiInlineImm())) { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; } @@ -1195,7 +1352,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Lo_32(Val))); return; } - case 2: { + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Val) && AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1206,6 +1366,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { Inst.addOperand(MCOperand::createImm(Val & 0xffff)); return; } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue()); + assert(AMDGPU::isInlinableLiteral16(LiteralVal, + AsmParser->hasInv2PiInlineImm())); + + uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 | + static_cast<uint32_t>(LiteralVal); + Inst.addOperand(MCOperand::createImm(ImmVal)); + return; + } default: llvm_unreachable("invalid operand size"); } @@ -1289,7 +1460,8 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Default(0); } -bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { auto R = parseRegister(); if (!R) return true; assert(R->isReg()); @@ -1299,20 +1471,43 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End return false; } -bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum) -{ +bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, + RegisterKind RegKind, unsigned Reg1, + unsigned RegNum) { switch (RegKind) { case IS_SPECIAL: - if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; } - if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; } - if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; } - if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; } - if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; } + if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { + Reg = AMDGPU::EXEC; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { + Reg = AMDGPU::FLAT_SCR; + RegWidth = 2; + return true; + } 
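// For illustration (assuming the usual asm spellings of these register pairs):
// a list such as [exec_lo, exec_hi] or [flat_scratch_lo, flat_scratch_hi] is
// folded here into the corresponding 64-bit register (EXEC, FLAT_SCR) with
// RegWidth == 2; the VCC, TBA and TMA pairs below are merged the same way.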
+ if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { + Reg = AMDGPU::VCC; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { + Reg = AMDGPU::TBA; + RegWidth = 2; + return true; + } + if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { + Reg = AMDGPU::TMA; + RegWidth = 2; + return true; + } return false; case IS_VGPR: case IS_SGPR: case IS_TTMP: - if (Reg1 != Reg + RegWidth) { return false; } + if (Reg1 != Reg + RegWidth) { + return false; + } RegWidth++; return true; default: @@ -1320,8 +1515,9 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R } } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex) -{ +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + unsigned *DwordRegIndex) { if (DwordRegIndex) { *DwordRegIndex = 0; } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); if (getLexer().is(AsmToken::Identifier)) { @@ -1462,8 +1658,33 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); } +bool +AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) { + if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) && + (getLexer().getKind() == AsmToken::Integer || + getLexer().getKind() == AsmToken::Real)) { + + // This is a workaround for handling operands like these: + // |1.0| + // |-1| + // This syntax is not compatible with syntax of standard + // MC expressions (due to the trailing '|'). + + SMLoc EndLoc; + const MCExpr *Expr; + + if (getParser().parsePrimaryExpr(Expr, EndLoc)) { + return true; + } + + return !Expr->evaluateAsAbsolute(Val); + } + + return getParser().parseAbsoluteExpression(Val); +} + OperandMatchResultTy -AMDGPUAsmParser::parseImm(OperandVector &Operands) { +AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { // TODO: add syntactic sugar for 1/(2*PI) bool Minus = false; if (getLexer().getKind() == AsmToken::Minus) { @@ -1475,7 +1696,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { switch(getLexer().getKind()) { case AsmToken::Integer: { int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) + if (parseAbsoluteExpr(IntVal, AbsMod)) return MatchOperand_ParseFail; if (Minus) IntVal *= -1; @@ -1484,7 +1705,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } case AsmToken::Real: { int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) + if (parseAbsoluteExpr(IntVal, AbsMod)) return MatchOperand_ParseFail; APFloat F(BitsToDouble(IntVal)); @@ -1512,8 +1733,8 @@ AMDGPUAsmParser::parseReg(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) { + auto res = parseImm(Operands, AbsMod); if (res != MatchOperand_NoMatch) { return res; } @@ -1522,18 +1743,50 @@ AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { - // XXX: During parsing we can't determine if minus sign means - // negate-modifier or negative immediate value. - // By default we suppose it is modifier. 
- bool Negate = false, Abs = false, Abs2 = false; +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, + bool AllowImm) { + bool Negate = false, Negate2 = false, Abs = false, Abs2 = false; if (getLexer().getKind()== AsmToken::Minus) { + const AsmToken NextToken = getLexer().peekTok(); + + // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. + if (NextToken.is(AsmToken::Minus)) { + Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier"); + return MatchOperand_ParseFail; + } + + // '-' followed by an integer literal N should be interpreted as integer + // negation rather than a floating-point NEG modifier applied to N. + // Beside being contr-intuitive, such use of floating-point NEG modifier + // results in different meaning of integer literals used with VOP1/2/C + // and VOP3, for example: + // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF + // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 + // Negative fp literals should be handled likewise for unifomtity + if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) { + Parser.Lex(); + Negate = true; + } + } + + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "neg") { + if (Negate) { + Error(Parser.getTok().getLoc(), "expected register or immediate"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Negate2 = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after neg"); + return MatchOperand_ParseFail; + } Parser.Lex(); - Negate = true; } - if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "abs") { Parser.Lex(); Abs2 = true; if (getLexer().isNot(AsmToken::LParen)) { @@ -1554,7 +1807,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo OperandMatchResultTy Res; if (AllowImm) { - Res = parseRegOrImm(Operands); + Res = parseRegOrImm(Operands, Abs); } else { Res = parseReg(Operands); } @@ -1563,9 +1816,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo } AMDGPUOperand::Modifiers Mods; - if (Negate) { - Mods.Neg = true; - } if (Abs) { if (getLexer().getKind() != AsmToken::Pipe) { Error(Parser.getTok().getLoc(), "expected vertical bar"); @@ -1583,6 +1833,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo Mods.Abs = true; } + if (Negate) { + Mods.Neg = true; + } else if (Negate2) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Neg = true; + } + if (Mods.hasFPModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); Op.setModifiers(Mods); @@ -1591,10 +1852,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, + bool AllowImm) { bool Sext = false; - if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == "sext") { Parser.Lex(); Sext = true; if (getLexer().isNot(AsmToken::LParen)) { @@ -1661,7 +1924,6 @@ OperandMatchResultTy 
AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || @@ -1719,6 +1981,128 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { return makeArrayRef(Variants); } +unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + const unsigned Num = Desc.getNumImplicitUses(); + for (unsigned i = 0; i < Num; ++i) { + unsigned Reg = Desc.ImplicitUses[i]; + switch (Reg) { + case AMDGPU::FLAT_SCR: + case AMDGPU::VCC: + case AMDGPU::M0: + return Reg; + default: + break; + } + } + return AMDGPU::NoRegister; +} + +bool AMDGPUAsmParser::isSGPR(unsigned Reg) { + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + +// NB: This code is correct only when used to check constant +// bus limitations because GFX7 support no f16 inline constants. +// Note that there are no cases when a GFX7 opcode violates +// constant bus limitations due to the use of an f16 constant. +bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, + unsigned OpIdx) const { + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + + if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) { + return false; + } + + const MCOperand &MO = Inst.getOperand(OpIdx); + + int64_t Val = MO.getImm(); + auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx); + + switch (OpSize) { // expected operand size + case 8: + return AMDGPU::isInlinableLiteral64(Val, hasInv2PiInlineImm()); + case 4: + return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm()); + case 2: { + const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType; + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) { + return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + } else { + return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); + } + } + default: + llvm_unreachable("invalid operand size"); + } +} + +bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { + const MCOperand &MO = Inst.getOperand(OpIdx); + if (MO.isImm()) { + return !isInlineConstant(Inst, OpIdx); + } + return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg())); +} + +bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + unsigned ConstantBusUseCount = 0; + + if (Desc.TSFlags & + (SIInstrFlags::VOPC | + SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) { + + // Check special imm operands (used by madmk, etc) + if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { + ++ConstantBusUseCount; + } + + unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); + if (SGPRUsed != AMDGPU::NoRegister) { + ++ConstantBusUseCount; + } + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx : OpIndices) { + if 
(OpIdx == -1) break; + + const MCOperand &MO = Inst.getOperand(OpIdx); + if (usesConstantBus(Inst, OpIdx)) { + if (MO.isReg()) { + const unsigned Reg = mc2PseudoReg(MO.getReg()); + // Pairs of registers with a partial intersections like these + // s0, s[0:1] + // flat_scratch_lo, flat_scratch + // flat_scratch_lo, flat_scratch_hi + // are theoretically valid but they are disabled anyway. + // Note that this code mimics SIInstrInfo::verifyInstruction + if (Reg != SGPRUsed) { + ++ConstantBusUseCount; + } + SGPRUsed = Reg; + } else { // Expression or a literal + ++ConstantBusUseCount; + } + } + } + } + + return ConstantBusUseCount <= 1; +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1751,6 +2135,10 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: + if (!validateOperandLimitations(Inst)) { + return Error(IDLoc, + "invalid operand (violates constant bus restrictions)"); + } Inst.setLoc(IDLoc); Out.EmitInstruction(Inst, getSTI()); return false; @@ -1793,7 +2181,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) { return false; } - bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor) { if (ParseAsAbsoluteExpression(Major)) @@ -1810,7 +2197,6 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, } bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { - uint32_t Major; uint32_t Minor; @@ -1831,9 +2217,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); - getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, - Isa.Stepping, + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, + ISA.Stepping, "AMD", "AMDGPU"); return false; } @@ -1873,42 +2260,45 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return false; } -bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() { - std::string Metadata; - raw_string_ostream MS(Metadata); +bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() { + std::string YamlString; + raw_string_ostream YamlStream(YamlString); getLexer().setSkipSpace(false); bool FoundEnd = false; while (!getLexer().is(AsmToken::Eof)) { while (getLexer().is(AsmToken::Space)) { - MS << ' '; + YamlStream << getLexer().getTok().getString(); Lex(); } if (getLexer().is(AsmToken::Identifier)) { StringRef ID = getLexer().getTok().getIdentifier(); - if (ID == ".end_amdgpu_runtime_metadata") { + if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) { Lex(); FoundEnd = true; break; } } - MS << Parser.parseStringToEndOfStatement() - << getContext().getAsmInfo()->getSeparatorString(); + YamlStream << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); Parser.eatToEndOfStatement(); } getLexer().setSkipSpace(true); - if (getLexer().is(AsmToken::Eof) && !FoundEnd) - return TokError("expected directive .end_amdgpu_runtime_metadata not found"); + if (getLexer().is(AsmToken::Eof) && !FoundEnd) { + return TokError( + "expected directive .end_amdgpu_code_object_metadata not found"); + } - MS.flush(); + YamlStream.flush(); - getTargetStreamer().EmitRuntimeMetadata(Metadata); + if 
(!getTargetStreamer().EmitCodeObjectMetadata(YamlString)) + return Error(getParser().getTok().getLoc(), "invalid code object metadata"); return false; } @@ -1926,7 +2316,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits()); while (true) { // Lex EndOfStatement. This is in a while loop, because lexing a comment @@ -2020,8 +2410,8 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".hsa_code_object_isa") return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amdgpu_runtime_metadata") - return ParseDirectiveRuntimeMetadata(); + if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin) + return ParseDirectiveCodeObjectMetadata(); if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); @@ -2080,7 +2470,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { - // Try to parse with a custom parser OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -2208,7 +2597,7 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy, + AMDGPUOperand::ImmTy ImmTy, bool (*ConvertResult)(int64_t&)) { SMLoc S = Parser.getTok().getLoc(); int64_t Value = 0; @@ -2225,9 +2614,59 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, return MatchOperand_Success; } +OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( + const char *Prefix, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { + StringRef Name = Parser.getTok().getString(); + if (!Name.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + Parser.Lex(); + + unsigned Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + // FIXME: How to verify the number of elements matches the number of src + // operands? 
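// Sketch of the accepted array syntax (anything other than 0 or 1 is rejected
// below); element I is packed into bit I of Val, so for example:
//   op_sel:[1,0]   -> Val = 0b01
//   neg_lo:[0,1,1] -> Val = 0b110
// At most three elements are read, matching src0..src2.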
+ for (int I = 0; I < 3; ++I) { + if (I != 0) { + if (getLexer().is(AsmToken::RBrac)) + break; + + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + } + + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + int64_t Op; + if (getParser().parseAbsoluteExpression(Op)) + return MatchOperand_ParseFail; + + if (Op != 0 && Op != 1) + return MatchOperand_ParseFail; + Val |= (Op << I); + } + + Parser.Lex(); + Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); + return MatchOperand_Success; +} + OperandMatchResultTy AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { + AMDGPUOperand::ImmTy ImmTy) { int64_t Bit = 0; SMLoc S = Parser.getTok().getLoc(); @@ -2257,11 +2696,11 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, return MatchOperand_Success; } -typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; - -void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, - OptionalImmIndexMap& OptionalIdx, - enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) { +static void addOptionalImmOperand( + MCInst& Inst, const OperandVector& Operands, + AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, + AMDGPUOperand::ImmTy ImmT, + int64_t Default = 0) { auto i = OptionalIdx.find(ImmT); if (i != OptionalIdx.end()) { unsigned Idx = i->second; @@ -2323,9 +2762,9 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } -void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; - bool GDSOnly = false; +void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, + bool IsGdsHardcoded) { + OptionalImmIndexMap OptionalIdx; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -2337,7 +2776,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { } if (Op.isToken() && Op.getToken() == "gds") { - GDSOnly = true; + IsGdsHardcoded = true; continue; } @@ -2346,9 +2785,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); - - if (!GDSOnly) { + if (!IsGdsHardcoded) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 @@ -2421,13 +2858,14 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) Parser.Lex(); - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); if (CntName == "vmcnt") - IntVal = encodeVmcnt(IV, IntVal, CntVal); + IntVal = encodeVmcnt(ISA, IntVal, CntVal); else if (CntName == "expcnt") - IntVal = encodeExpcnt(IV, IntVal, CntVal); + IntVal = encodeExpcnt(ISA, IntVal, CntVal); else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(IV, IntVal, CntVal); + IntVal = encodeLgkmcnt(ISA, IntVal, CntVal); else return true; @@ -2436,8 +2874,9 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - IsaVersion IV = getIsaVersion(getSTI().getFeatureBits()); - int64_t Waitcnt = 
getWaitcntBitMask(IV); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + int64_t Waitcnt = getWaitcntBitMask(ISA); SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { @@ -2459,7 +2898,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } -bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) { +bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, + int64_t &Width) { using namespace llvm::AMDGPU::Hwreg; if (Parser.getTok().getString() != "hwreg") @@ -2520,8 +2960,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, return false; } -OperandMatchResultTy -AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { +OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; int64_t Imm16Val = 0; @@ -3170,6 +3609,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr}, + {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr}, + {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, + {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, + {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr} }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { @@ -3186,6 +3629,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan res = parseSDWASel(Operands, Op.Name, Op.Type); } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) { res = parseSDWADstUnused(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyOpSel || + Op.Type == AMDGPUOperand::ImmTyOpSelHi || + Op.Type == AMDGPUOperand::ImmTyNegLo || + Op.Type == AMDGPUOperand::ImmTyNegHi) { + res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, + Op.ConvertResult); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } @@ -3241,8 +3690,8 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; } -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; +void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx) { unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { @@ -3253,12 +3702,20 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - } else if (Op.isImm()) { + } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; + } else if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); } else { llvm_unreachable("unhandled operand type"); } } +} + +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + cvtVOP3Impl(Inst, Operands, OptionalIdx); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); @@ -3283,6 +3740,96 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const 
OperandVector &Operands) { } } +void AMDGPUAsmParser::cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isMod()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + Op.addRegOrImmOperands(Inst, 1); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); +} + +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptIdx; + + cvtVOP3Impl(Inst, Operands, OptIdx); + + // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3 + // instruction, and then figure out where to actually put the modifiers + int Opc = Inst.getOpcode(); + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + } + + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1); + + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); + if (NegLoIdx != -1) { + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); + } + + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + const int ModOps[] = { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }; + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + unsigned NegLo = 0; + unsigned NegHi = 0; + + if (NegLoIdx != -1) { + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + NegLo = Inst.getOperand(NegLoIdx).getImm(); + NegHi = Inst.getOperand(NegHiIdx).getImm(); + } + + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + break; + + uint32_t ModVal = 0; + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + + if ((OpSelHi & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_1; + + if ((NegLo & (1 << J)) != 0) + ModVal |= SISrcMods::NEG; + + if ((NegHi & (1 << J)) != 0) + ModVal |= SISrcMods::NEG_HI; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// @@ -3436,7 +3983,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { - // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token. + // VOP2b (v_add_u32, v_sub_u32 ...) dpp use "vcc" token. // Skip it. 
continue; } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { @@ -3547,6 +4094,7 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, uint64_t BasicInstType) { + using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; unsigned I = 1; @@ -3581,21 +4129,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, // V_NOP_sdwa_vi has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOPC: - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; default: diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 45a7fe6d3439..a6609f0725ab 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -21,8 +21,8 @@ def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset" class MubufLoad <SDPatternOperator op> : PatFrag < (ops node:$ptr), (op node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; + return AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; }]>; def mubuf_load : MubufLoad <load>; @@ -705,12 +705,6 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", let Predicates = [isGCN] in { -// int_SI_vs_load_input -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) ->; - // Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 
02d441756c85..7c0ef4aeac3c 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -12,11 +12,17 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) +if(LLVM_BUILD_GLOBAL_ISEL) + tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) +endif() add_public_tablegen_target(AMDGPUCommonTableGen) # List of all GlobalISel files. set(GLOBAL_ISEL_FILES AMDGPUCallLowering.cpp + AMDGPUInstructionSelector.cpp + AMDGPULegalizerInfo.cpp + AMDGPURegisterBankInfo.cpp ) # Add GlobalISel files to the dependencies if the user wants to build it. @@ -30,6 +36,7 @@ endif() add_llvm_target(AMDGPUCodeGen AMDILCFGStructurizer.cpp + AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp AMDGPUAnnotateUniformValues.cpp @@ -39,6 +46,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp + AMDGPULowerIntrinsics.cpp AMDGPUMCInstLower.cpp AMDGPUMachineFunction.cpp AMDGPUUnifyMetadata.cpp @@ -50,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp + AMDGPUUnifyDivergentExitNodes.cpp GCNHazardRecognizer.cpp GCNSchedStrategy.cpp R600ClauseMergePass.cpp @@ -68,10 +77,12 @@ add_llvm_target(AMDGPUCodeGen SIDebuggerInsertNops.cpp SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp + SIFixVGPRCopies.cpp SIFoldOperands.cpp SIFrameLowering.cpp SIInsertSkips.cpp SIInsertWaits.cpp + SIInsertWaitcnts.cpp SIInstrInfo.cpp SIISelLowering.cpp SILoadStoreOptimizer.cpp @@ -80,10 +91,14 @@ add_llvm_target(AMDGPUCodeGen SIMachineFunctionInfo.cpp SIMachineScheduler.cpp SIOptimizeExecMasking.cpp + SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp SIWholeQuadMode.cpp + GCNIterativeScheduler.cpp + GCNMinRegStrategy.cpp + GCNRegPressure.cpp ${GLOBAL_ISEL_BUILD_FILES} ) diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index a077001df6bd..a9f64589fa5e 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -88,18 +88,6 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> let has_vdst = 0; } -class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName, - (outs), - (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), - "$addr $offset0$offset1$gds"> { - - let has_data0 = 0; - let has_data1 = 0; - let has_vdst = 0; - let has_offset = 0; - let AsmMatchConverter = "cvtDSOffset01"; -} - class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), @@ -143,6 +131,20 @@ class DS_1A2D_RET<string opName, let hasPostISelHook = 1; } +class DS_1A2D_Off8_RET<string opName, + RegisterClass rc = VGPR_32, + RegisterClass src = rc> +: DS_Pseudo<opName, + (outs rc:$vdst), + (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds), + "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> { + + let has_offset = 0; + let AsmMatchConverter = "cvtDSOffset01"; + + let hasPostISelHook = 1; +} + class DS_1A_RET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs rc:$vdst), @@ -174,6 +176,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, let has_data1 = 0; let has_gds = 0; let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; } class DS_0A_RET <string opName> : DS_Pseudo<opName, @@ -202,20 +205,46 
@@ class DS_1A <string opName> : DS_Pseudo<opName, let has_data1 = 0; } -class DS_1A_GDS <string opName> : DS_Pseudo<opName, - (outs), - (ins VGPR_32:$addr), - "$addr gds"> { +class DS_GWS <string opName, dag ins, string asmOps> +: DS_Pseudo<opName, (outs), ins, asmOps> { + + let has_vdst = 0; + let has_addr = 0; + let has_data0 = 0; + let has_data1 = 0; + + let has_gds = 0; + let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; +} + +class DS_GWS_0D <string opName> +: DS_GWS<opName, + (ins offset:$offset, gds:$gds), "$offset gds">; - let has_vdst = 0; - let has_data0 = 0; - let has_data1 = 0; - let has_offset = 0; +class DS_GWS_1D <string opName> +: DS_GWS<opName, + (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> { + + let has_data0 = 1; +} + +class DS_VOID <string opName> : DS_Pseudo<opName, + (outs), (ins), ""> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 1; + let UseNamedOperandTable = 0; + let AsmMatchConverter = ""; + + let has_vdst = 0; + let has_addr = 0; + let has_data0 = 0; + let has_data1 = 0; + let has_offset = 0; let has_offset0 = 0; let has_offset1 = 0; - - let has_gds = 0; - let gdsValue = 1; + let has_gds = 0; } class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> @@ -226,6 +255,8 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> [(set i32:$vdst, (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > { + let LGKM_CNT = 0; + let mayLoad = 0; let mayStore = 0; let isConvergent = 1; @@ -324,9 +355,9 @@ def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">, def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">, AtomicNoRet<"", 1>; -def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, +def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, AtomicNoRet<"", 1>; -def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, +def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, AtomicNoRet<"", 1>; def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>, @@ -365,17 +396,17 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>, AtomicNoRet<"ds_max_f64", 1>; def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>, - AtomicNoRet<"ds_wrxchg_b64", 1>; -def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"ds_wrxchg2_b64", 1>; -def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"ds_wrxchg2st64_b64", 1>; - -def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">; -def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">; -def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">; -def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">; -def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">; + AtomicNoRet<"", 1>; +def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"", 1>; +def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, + AtomicNoRet<"", 1>; + +def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; +def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; +def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">; +def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; +def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; @@ -386,7 +417,7 @@ def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">; def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">; def DS_MIN_SRC2_U32 : 
DS_1A<"ds_min_src2_u32">; def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">; -def DS_AND_SRC2_B32 : DS_1A<"ds_and_src_b32">; +def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">; def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">; def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">; def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">; @@ -407,8 +438,8 @@ def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">; def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">; def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">; -def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">; -def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">; +def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; +def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; @@ -429,30 +460,34 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>; def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>; } -let SubtargetPredicate = isSICI in { def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; -} //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. //===----------------------------------------------------------------------===// -// Remaining instructions: -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 let SubtargetPredicate = isCIVI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">, - AtomicNoRet<"ds_wrap_f32", 1>; +def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>; + +def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>, + AtomicNoRet<"", 1>; + +def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; + +let mayStore = 0 in { +def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>; +def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>; +} // End mayStore = 0 + +let mayLoad = 0 in { +def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>; +def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>; +} // End mayLoad = 0 + +def DS_NOP : DS_VOID<"ds_nop">; } // let SubtargetPredicate = isCIVI @@ -623,6 +658,7 @@ def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>; def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>; def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>; def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>; +def DS_NOP_si : DS_Real_si<0x14, DS_NOP>; def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>; def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>; def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>; @@ -651,8 +687,10 @@ def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>; def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>; def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>; -// FIXME: this instruction is actually CI/VI -def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>; +// These instruction are CI/VI only +def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>; +def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>; +def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>; def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>; def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>; @@ -744,6 +782,10 @@ def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>; def 
DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>; +def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>; +def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>; +def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>; //===----------------------------------------------------------------------===// // VIInstructions.td @@ -787,12 +829,13 @@ def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>; def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>; def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>; def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>; +def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>; def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>; -def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>; -def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>; -def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>; -def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>; -def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>; +def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>; +def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>; +def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>; +def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>; +def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>; def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>; def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; @@ -815,7 +858,7 @@ def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>; def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>; def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>; def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>; -def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>; +def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>; def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>; def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>; def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>; @@ -824,6 +867,9 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; +def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>; +def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>; +def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>; def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>; def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>; def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>; @@ -865,6 +911,8 @@ def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>; def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>; def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>; def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>; +def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>; +def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>; def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>; def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>; def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>; @@ -904,3 +952,7 @@ def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>; def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>; def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>; def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>; +def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>; +def 
DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; +def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; +def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2247cad7bb51..4fb03b62bba9 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -22,6 +22,7 @@ #include "AMDGPURegisterInfo.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixedLenDisassembler.h" @@ -97,9 +98,13 @@ static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" -#undef GET_SUBTARGETINFO_ENUM +static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); +} #include "AMDGPUGenDisassemblerTables.inc" @@ -138,7 +143,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, CommentStream = &CS; // ToDo: AMDGPUDisassembler supports only VI ISA. - assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA."); + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) + report_fatal_error("Disassembly not yet supported for subtarget"); const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -179,6 +185,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); } while (false); + if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || + MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || + MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { + // Insert dummy unused src2_modifiers. + int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src2_modifiers); + auto I = MI.begin(); + std::advance(I, Src2ModIdx); + MI.insert(I, MCOperand::createImm(0)); + } + Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0; return Res; } @@ -263,6 +280,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const { + return decodeSrcOp(OPWV216, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. 
Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -423,6 +444,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { case OPW64: return MCOperand::createImm(getInlineImmVal64(Imm)); case OPW16: + case OPWV216: return MCOperand::createImm(getInlineImmVal16(Imm)); default: llvm_unreachable("implement me"); @@ -436,6 +458,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return VGPR_32RegClassID; case OPW64: return VReg_64RegClassID; case OPW128: return VReg_128RegClassID; @@ -449,6 +472,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; @@ -462,6 +486,7 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { default: // fall case OPW32: case OPW16: + case OPWV216: return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; @@ -497,6 +522,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c switch (Width) { case OPW32: case OPW16: + case OPWV216: return decodeSpecialReg32(Val); case OPW64: return decodeSpecialReg64(Val); @@ -522,6 +548,11 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { case 124: return createRegOperand(M0); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); + case 235: return createRegOperand(SRC_SHARED_BASE); + case 236: return createRegOperand(SRC_SHARED_LIMIT); + case 237: return createRegOperand(SRC_PRIVATE_BASE); + case 238: return createRegOperand(SRC_PRIVATE_LIMIT); + // TODO: SRC_POPS_EXITING_WAVE_ID // ToDo: no support for vccz register case 251: break; // ToDo: no support for execz register diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index ee5883a984e0..d50665187e10 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -67,6 +67,7 @@ public: MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; + MCOperand decodeOperand_VSrcV216(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; @@ -85,6 +86,7 @@ public: OPW64, OPW128, OPW16, + OPWV216, OPW_LAST_, OPW_FIRST_ = OPW32 }; diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 48c6592ca5b2..5480110d8315 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -35,28 +35,59 @@ class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, "MEM_RAT_CACHELESS "#name, pattern>; -class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, - list<dag> pattern> - : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, +class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + dag outs, string name, list<dag> pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, mask, outs, ins, "MEM_RAT "#name, pattern>; class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> - : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, - 
i32imm:$rat_id, InstFlag:$eop), + : CF_MEM_RAT <0x1, ?, 0xf, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), (outs), "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" #!if(has_eop, ", $eop", ""), [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, (i32 imm:$rat_id))]>; -def RAT_MSKOR : CF_MEM_RAT <0x11, 0, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs), "MSKOR $rw_gpr.XW, $index_gpr", [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] > { let eop = 0; } + +multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> { + let Constraints = "$rw_gpr = $out_gpr", eop = 0, mayStore = 1 in { + def _RTN: CF_MEM_RAT <op_ret, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + (outs R600_Reg128:$out_gpr), + name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >; + def _NORET: CF_MEM_RAT <op_noret, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + (outs R600_Reg128:$out_gpr), + name ## " $rw_gpr, $index_gpr", [] >; + } +} + +// Swap no-ret is just store. Raw store to cached target +// can only store on dword, which exactly matches swap_no_ret. +defm RAT_ATOMIC_XCHG_INT : RAT_ATOMIC<1, 34, "ATOMIC_XCHG_INT">; +defm RAT_ATOMIC_CMPXCHG_INT : RAT_ATOMIC<4, 36, "ATOMIC_CMPXCHG_INT">; +defm RAT_ATOMIC_ADD : RAT_ATOMIC<7, 39, "ATOMIC_ADD">; +defm RAT_ATOMIC_SUB : RAT_ATOMIC<8, 40, "ATOMIC_SUB">; +defm RAT_ATOMIC_RSUB : RAT_ATOMIC<9, 41, "ATOMIC_RSUB">; +defm RAT_ATOMIC_MIN_INT : RAT_ATOMIC<10, 42, "ATOMIC_MIN_INT">; +defm RAT_ATOMIC_MIN_UINT : RAT_ATOMIC<11, 43, "ATOMIC_MIN_UINT">; +defm RAT_ATOMIC_MAX_INT : RAT_ATOMIC<12, 44, "ATOMIC_MAX_INT">; +defm RAT_ATOMIC_MAX_UINT : RAT_ATOMIC<13, 45, "ATOMIC_MAX_UINT">; +defm RAT_ATOMIC_AND : RAT_ATOMIC<14, 46, "ATOMIC_AND">; +defm RAT_ATOMIC_OR : RAT_ATOMIC<15, 47, "ATOMIC_OR">; +defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, "ATOMIC_XOR">; +defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">; +defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">; + } // End let Predicates = [isEGorCayman] //===----------------------------------------------------------------------===// @@ -257,6 +288,76 @@ def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), let Predicates = [isEGorCayman] in { +multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret, + SDPatternOperator node_ret, SDPatternOperator node_noret> { + // FIXME: Add _RTN version. We need per WI scratch location to store the old value + // EXTRACT_SUBREG here is dummy, we know the node has no uses + def : Pat<(i32 (node_noret i32:$ptr, i32:$data)), + (EXTRACT_SUBREG (inst_noret + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>; +} +multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret, + SDPatternOperator node_ret, SDPatternOperator node_noret, int C> { + // FIXME: Add _RTN version. We need per WI scratch location to store the old value + // EXTRACT_SUBREG here is dummy, we know the node has no uses + def : Pat<(i32 (node_noret i32:$ptr, C)), + (EXTRACT_SUBREG (inst_noret + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>; +} + +// CMPSWAP is pattern is special +// EXTRACT_SUBREG here is dummy, we know the node has no uses +// FIXME: Add _RTN version. 
We need per WI scratch location to store the old value +def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)), + (EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET + (INSERT_SUBREG + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3), + $data, sub0), + $ptr), sub1)>; + +defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN, + RAT_ATOMIC_XCHG_INT_NORET, + atomic_swap_global_ret, + atomic_swap_global_noret>; +defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET, + atomic_add_global_ret, atomic_add_global_noret>; +defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET, + atomic_sub_global_ret, atomic_sub_global_noret>; +defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN, + RAT_ATOMIC_MIN_INT_NORET, + atomic_min_global_ret, atomic_min_global_noret>; +defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN, + RAT_ATOMIC_MIN_UINT_NORET, + atomic_umin_global_ret, atomic_umin_global_noret>; +defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN, + RAT_ATOMIC_MAX_INT_NORET, + atomic_max_global_ret, atomic_max_global_noret>; +defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN, + RAT_ATOMIC_MAX_UINT_NORET, + atomic_umax_global_ret, atomic_umax_global_noret>; +defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET, + atomic_and_global_ret, atomic_and_global_noret>; +defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET, + atomic_or_global_ret, atomic_or_global_noret>; +defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET, + atomic_xor_global_ret, atomic_xor_global_noret>; +defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, + RAT_ATOMIC_INC_UINT_NORET, + atomic_add_global_ret, + atomic_add_global_noret, 1>; +defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN, + RAT_ATOMIC_INC_UINT_NORET, + atomic_sub_global_ret, + atomic_sub_global_noret, -1>; +defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, + RAT_ATOMIC_DEC_UINT_NORET, + atomic_add_global_ret, + atomic_add_global_noret, -1>; +defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN, + RAT_ATOMIC_DEC_UINT_NORET, + atomic_sub_global_ret, + atomic_sub_global_noret, 1>; + // Should be predicated on FeatureFP64 // def FMA_64 : R600_3OP < // 0xA, "FMA_64", @@ -287,7 +388,7 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", VecALU >; -def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>; +defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>; def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], @@ -337,7 +438,7 @@ defm CUBE_eg : CUBE_Common<0xC0>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, VecALU>; def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 849fb8ad50f5..b0ac0e689a0b 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -136,7 +136,7 @@ multiclass FLAT_Atomic_Pseudo< class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), 
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}] >; def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; @@ -284,16 +284,16 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; }]>; class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS; + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS; }]>; def atomic_flat_load : flat_ld <atomic_load>; diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index dd3b46f13921..80fc4ac9d2a3 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -11,11 +11,24 @@ // //===----------------------------------------------------------------------===// -#include "GCNHazardRecognizer.h" #include "AMDGPUSubtarget.h" +#include "GCNHazardRecognizer.h" +#include "SIDefines.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/Debug.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> +#include <limits> +#include <set> +#include <vector> using namespace llvm; @@ -26,7 +39,8 @@ using namespace llvm; GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), - ST(MF.getSubtarget<SISubtarget>()) { + ST(MF.getSubtarget<SISubtarget>()), + TII(*ST.getInstrInfo()) { MaxLookAhead = 5; } @@ -58,8 +72,19 @@ static bool isRFE(unsigned Opcode) { return Opcode == AMDGPU::S_RFE_B64; } -static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { +static bool isSMovRel(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_MOVRELS_B32: + case AMDGPU::S_MOVRELS_B64: + case AMDGPU::S_MOVRELD_B32: + case AMDGPU::S_MOVRELD_B64: + return true; + default: + return false; + } +} +static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; @@ -96,6 +121,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return NoopHazard; + if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && + checkReadM0Hazards(MI) > 0) + return NoopHazard; + + if (checkAnyInstHazards(MI) > 0) + return NoopHazard; + return NoHazard; } @@ -104,11 +136,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { } unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + int WaitStates = std::max(0, checkAnyInstHazards(MI)); + if (SIInstrInfo::isSMRD(*MI)) - return std::max(0, checkSMRDHazards(MI)); + return 
std::max(WaitStates, checkSMRDHazards(MI)); if (SIInstrInfo::isVALU(*MI)) { - int WaitStates = std::max(0, checkVALUHazards(MI)); + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); if (SIInstrInfo::isVMEM(*MI)) WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); @@ -122,19 +156,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (isRWLane(MI->getOpcode())) WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if (TII.isVINTRP(*MI)) + WaitStates = std::max(WaitStates, checkReadM0Hazards(MI)); + return WaitStates; } if (isSGetReg(MI->getOpcode())) - return std::max(0, checkGetRegHazards(MI)); + return std::max(WaitStates, checkGetRegHazards(MI)); if (isSSetReg(MI->getOpcode())) - return std::max(0, checkSetRegHazards(MI)); + return std::max(WaitStates, checkSetRegHazards(MI)); if (isRFE(MI->getOpcode())) - return std::max(0, checkRFEHazards(MI)); + return std::max(WaitStates, checkRFEHazards(MI)); - return 0; + if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) + return std::max(WaitStates, checkReadM0Hazards(MI)); + + return WaitStates; } void GCNHazardRecognizer::EmitNoop() { @@ -142,14 +182,12 @@ void GCNHazardRecognizer::EmitNoop() { } void GCNHazardRecognizer::AdvanceCycle() { - // When the scheduler detects a stall, it will call AdvanceCycle() without // emitting any instructions. if (!CurrCycleInstr) return; - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr); + unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); // Keep track of emitted instructions EmittedInstrs.push_front(CurrCycleInstr); @@ -180,7 +218,6 @@ void GCNHazardRecognizer::RecedeCycle() { int GCNHazardRecognizer::getWaitStatesSince( function_ref<bool(MachineInstr *)> IsHazard) { - int WaitStates = -1; for (MachineInstr *MI : EmittedInstrs) { ++WaitStates; @@ -204,7 +241,6 @@ int GCNHazardRecognizer::getWaitStatesSinceDef( int GCNHazardRecognizer::getWaitStatesSinceSetReg( function_ref<bool(MachineInstr *)> IsHazard) { - auto IsHazardFn = [IsHazard] (MachineInstr *MI) { return isSSetReg(MI->getOpcode()) && IsHazard(MI); }; @@ -281,7 +317,6 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); int WaitStatesNeeded = 0; WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD); @@ -293,7 +328,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { // A read of an SGPR by SMRD instruction requires 4 wait states when the // SGPR was written by a VALU instruction. 
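// [Illustrative sketch, not part of the patch.] Every hazard check in this
// recognizer follows the same "required distance minus elapsed distance"
// pattern: a hazard needs N wait states between producer and consumer,
// getWaitStatesSince*() reports how many have already passed, and the caller
// clamps the difference at zero and takes the maximum over all hazards. The
// helper below is a hypothetical, STL-only model of that bookkeeping.
#include <algorithm>
#include <utility>
#include <vector>

// Each pair holds (wait states required, wait states already elapsed).
static int noopsToInsert(const std::vector<std::pair<int, int>> &Hazards) {
  int WaitStatesNeeded = 0;
  for (const auto &H : Hazards)
    WaitStatesNeeded = std::max(WaitStatesNeeded, H.first - H.second);
  // e.g. an SMRD reading an SGPR written 1 wait state ago by a VALU ({4, 1})
  // still needs 3 NOPs; an already satisfied hazard contributes nothing.
  return WaitStatesNeeded;
}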
int SmrdSgprWaitStates = 4; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; for (const MachineOperand &Use : SMRD->uses()) { if (!Use.isReg()) @@ -486,7 +521,6 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { } int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { - if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return 0; @@ -500,3 +534,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn); return RFEWaitStates - WaitStatesNeeded; } + +int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { + if (MI->isDebugValue()) + return 0; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (!ST.hasSMovFedHazard()) + return 0; + + // Check for any instruction reading an SGPR after a write from + // s_mov_fed_b32. + int MovFedWaitStates = 1; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Use : MI->uses()) { + if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + auto IsHazardFn = [] (MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::S_MOV_FED_B32; + }; + int WaitStatesNeededForUse = + MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { + if (!ST.hasReadM0Hazard()) + return 0; + + const SIInstrInfo *TII = ST.getInstrInfo(); + int SMovRelWaitStates = 1; + auto IsHazardFn = [TII] (MachineInstr *MI) { + return TII->isSALU(*MI); + }; + return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn); +} diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index 0ab82ff4635b..5680c3de6a1a 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -34,6 +34,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { std::list<MachineInstr*> EmittedInstrs; const MachineFunction &MF; const SISubtarget &ST; + const SIInstrInfo &TII; int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); int getWaitStatesSinceDef(unsigned Reg, @@ -52,6 +53,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkVALUHazards(MachineInstr *VALU); int checkRWLaneHazards(MachineInstr *RWLane); int checkRFEHazards(MachineInstr *RFE); + int checkAnyInstHazards(MachineInstr *MI); + int checkReadM0Hazards(MachineInstr *SMovRel); public: GCNHazardRecognizer(const MachineFunction &MF); // We can only issue one instruction per cycle. diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp new file mode 100644 index 000000000000..3bb5c9bc22b7 --- /dev/null +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -0,0 +1,528 @@ +//===--------------------- GCNIterativeScheduler.cpp - --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNIterativeScheduler.h" +#include "GCNSchedStrategy.h" +#include "SIMachineFunctionInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +namespace llvm { + std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG); +} + +// shim accessors for different order containers +static inline MachineInstr *getMachineInstr(MachineInstr *MI) { + return MI; +} +static inline MachineInstr *getMachineInstr(const SUnit *SU) { + return SU->getInstr(); +} +static inline MachineInstr *getMachineInstr(const SUnit &SU) { + return SU.getInstr(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +static void printRegion(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const LiveIntervals *LIS, + unsigned MaxInstNum = + std::numeric_limits<unsigned>::max()) { + auto BB = Begin->getParent(); + OS << BB->getParent()->getName() << ":BB#" << BB->getNumber() + << ' ' << BB->getName() << ":\n"; + auto I = Begin; + MaxInstNum = std::max(MaxInstNum, 1u); + for (; I != End && MaxInstNum; ++I, --MaxInstNum) { + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (I != End) { + OS << "\t...\n"; + I = std::prev(End); + if (!I->isDebugValue() && LIS) + OS << LIS->getInstructionIndex(*I); + OS << '\t' << *I; + } + if (End != BB->end()) { // print boundary inst if present + OS << "----\n"; + if (LIS) OS << LIS->getInstructionIndex(*End) << '\t'; + OS << *End; + } +} + +LLVM_DUMP_METHOD +static void printLivenessInfo(raw_ostream &OS, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const LiveIntervals *LIS) { + const auto BB = Begin->getParent(); + const auto &MRI = BB->getParent()->getRegInfo(); + + const auto LiveIns = getLiveRegsBefore(*Begin, *LIS); + OS << "LIn RP: "; + getRegPressure(MRI, LiveIns).print(OS); + + const auto BottomMI = End == BB->end() ? 
std::prev(End) : End; + const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS); + OS << "LOt RP: "; + getRegPressure(MRI, LiveOuts).print(OS); +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printRegions(raw_ostream &OS) const { + const auto &ST = MF.getSubtarget<SISubtarget>(); + for (const auto R : Regions) { + OS << "Region to schedule "; + printRegion(OS, R->Begin, R->End, LIS, 1); + printLivenessInfo(OS, R->Begin, R->End, LIS); + OS << "Max RP: "; + R->MaxPressure.print(OS, &ST); + } +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const { + OS << "\nAfter scheduling "; + printRegion(OS, R->Begin, R->End, LIS); + printSchedRP(OS, R->MaxPressure, RP); + OS << '\n'; +} + +LLVM_DUMP_METHOD +void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const { + const auto &ST = MF.getSubtarget<SISubtarget>(); + OS << "RP before: "; + Before.print(OS, &ST); + OS << "RP after: "; + After.print(OS, &ST); +} + +#endif + +// DAG builder helper +class GCNIterativeScheduler::BuildDAG { + GCNIterativeScheduler &Sch; + SmallVector<SUnit*, 8> TopRoots; +public: + BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) + : Sch(_Sch) { + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + + Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, + /*TrackLaneMask*/true); + Sch.Topo.InitDAGTopologicalSorting(); + + SmallVector<SUnit*, 8> BotRoots; + Sch.findRootsAndBiasEdges(TopRoots, BotRoots); + } + ~BuildDAG() { + Sch.BaseClass::exitRegion(); + Sch.BaseClass::finishBlock(); + } + ArrayRef<const SUnit*> getTopRoots() const { + return TopRoots; + } +}; + +class GCNIterativeScheduler::OverrideLegacyStrategy { + GCNIterativeScheduler &Sch; + Region &Rgn; + std::unique_ptr<MachineSchedStrategy> SaveSchedImpl; + GCNRegPressure SaveMaxRP; +public: + OverrideLegacyStrategy(Region &R, + MachineSchedStrategy &OverrideStrategy, + GCNIterativeScheduler &_Sch) + : Sch(_Sch) + , Rgn(R) + , SaveSchedImpl(std::move(_Sch.SchedImpl)) + , SaveMaxRP(R.MaxPressure) { + Sch.SchedImpl.reset(&OverrideStrategy); + auto BB = R.Begin->getParent(); + Sch.BaseClass::startBlock(BB); + Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); + } + ~OverrideLegacyStrategy() { + Sch.BaseClass::exitRegion(); + Sch.BaseClass::finishBlock(); + Sch.SchedImpl.release(); + Sch.SchedImpl = std::move(SaveSchedImpl); + } + void schedule() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + DEBUG(dbgs() << "\nScheduling "; + printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); + Sch.BaseClass::schedule(); + + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + Sch.RegionEnd = Rgn.End; + //assert(Rgn.End == Sch.RegionEnd); + Rgn.Begin = Sch.RegionBegin; + Rgn.MaxPressure.clear(); + } + void restoreOrder() { + assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); + // DAG SUnits are stored using original region's order + // so just use SUnits as the restoring schedule + Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP); + } +}; + +// just a stub to make base class happy +class SchedStrategyStub : public MachineSchedStrategy { +public: + bool shouldTrackPressure() const override { return false; } + bool shouldTrackLaneMasks() const override { return false; } + void initialize(ScheduleDAGMI *DAG) override {} + SUnit *pickNode(bool &IsTopNode) override { 
return nullptr; } + void schedNode(SUnit *SU, bool IsTopNode) override {} + void releaseTopNode(SUnit *SU) override {} + void releaseBottomNode(SUnit *SU) override {} +}; + +GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S) + : BaseClass(C, llvm::make_unique<SchedStrategyStub>()) + , Context(C) + , Strategy(S) + , UPTracker(*LIS) { +} + +// returns max pressure for a region +GCNRegPressure +GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) + const { + // For the purpose of pressure tracking bottom inst of the region should + // be also processed. End is either BB end, BB terminator inst or sched + // boundary inst. + auto const BBEnd = Begin->getParent()->end(); + auto const BottomMI = End == BBEnd ? std::prev(End) : End; + + // scheduleRegions walks bottom to top, so its likely we just get next + // instruction to track + auto AfterBottomMI = std::next(BottomMI); + if (AfterBottomMI == BBEnd || + &*AfterBottomMI != UPTracker.getLastTrackedMI()) { + UPTracker.reset(*BottomMI); + } else { + assert(UPTracker.isValid()); + } + + for (auto I = BottomMI; I != Begin; --I) + UPTracker.recede(*I); + + UPTracker.recede(*Begin); + + assert(UPTracker.isValid() || + (dbgs() << "Tracked region ", + printRegion(dbgs(), Begin, End, LIS), false)); + return UPTracker.moveMaxPressure(); +} + +// returns max pressure for a tentative schedule +template <typename Range> GCNRegPressure +GCNIterativeScheduler::getSchedulePressure(const Region &R, + Range &&Schedule) const { + auto const BBEnd = R.Begin->getParent()->end(); + GCNUpwardRPTracker RPTracker(*LIS); + if (R.End != BBEnd) { + // R.End points to the boundary instruction but the + // schedule doesn't include it + RPTracker.reset(*R.End); + RPTracker.recede(*R.End); + } else { + // R.End doesn't point to the boundary instruction + RPTracker.reset(*std::prev(BBEnd)); + } + for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) { + RPTracker.recede(*getMachineInstr(*--I)); + } + return RPTracker.moveMaxPressure(); +} + +void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs); + if (NumRegionInstrs > 2) { + Regions.push_back( + new (Alloc.Allocate()) + Region { Begin, End, NumRegionInstrs, + getRegionPressure(Begin, End), nullptr }); + } +} + +void GCNIterativeScheduler::schedule() { // overriden + // do nothing + DEBUG( + printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); + if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { + dbgs() << "Max RP: "; + Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>()); + } + dbgs() << '\n'; + ); +} + +void GCNIterativeScheduler::finalizeSchedule() { // overriden + if (Regions.empty()) + return; + switch (Strategy) { + case SCHEDULE_MINREGONLY: scheduleMinReg(); break; + case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break; + case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break; + } +} + +// Detach schedule from SUnits and interleave it with debug values. +// Returned schedule becomes independent of DAG state. 
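// [Illustrative sketch, not part of the patch.] The function that follows
// converts a schedule expressed as SUnits into a plain instruction list and
// re-attaches the debug values recorded next to each instruction, so the
// result stays usable after the DAG is torn down. The STL-only model below
// uses a hypothetical Instr type in place of SUnit/MachineInstr.
#include <algorithm>
#include <utility>
#include <vector>

struct Instr { int Id; };

static std::vector<const Instr *>
detachedOrder(const std::vector<const Instr *> &Schedule,
              const std::vector<std::pair<const Instr *, const Instr *>> &DbgPairs) {
  std::vector<const Instr *> Res;
  Res.reserve(Schedule.size() * 2);
  for (const Instr *I : Schedule) {
    Res.push_back(I);
    // DbgPairs maps a debug value (first) to the instruction it trails (second).
    auto D = std::find_if(DbgPairs.begin(), DbgPairs.end(),
                          [I](const std::pair<const Instr *, const Instr *> &P) {
                            return P.second == I;
                          });
    if (D != DbgPairs.end())
      Res.push_back(D->first);
  }
  return Res;
}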
+std::vector<MachineInstr*> +GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const { + std::vector<MachineInstr*> Res; + Res.reserve(Schedule.size() * 2); + + if (FirstDbgValue) + Res.push_back(FirstDbgValue); + + const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end(); + for (auto SU : Schedule) { + Res.push_back(SU->getInstr()); + const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) { + return P.second == SU->getInstr(); + }); + if (D != DbgE) + Res.push_back(D->first); + } + return Res; +} + +void GCNIterativeScheduler::setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP) { + R.BestSchedule.reset( + new TentativeSchedule{ detachSchedule(Schedule), MaxRP }); +} + +void GCNIterativeScheduler::scheduleBest(Region &R) { + assert(R.BestSchedule.get() && "No schedule specified"); + scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure); + R.BestSchedule.reset(); +} + +// minimal required region scheduler, works for ranges of SUnits*, +// SUnits or MachineIntrs* +template <typename Range> +void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP) { + assert(RegionBegin == R.Begin && RegionEnd == R.End); + assert(LIS != nullptr); +#ifndef NDEBUG + const auto SchedMaxRP = getSchedulePressure(R, Schedule); +#endif + auto BB = R.Begin->getParent(); + auto Top = R.Begin; + for (const auto &I : Schedule) { + auto MI = getMachineInstr(I); + if (MI != &*Top) { + BB->remove(MI); + BB->insert(Top, MI); + if (!MI->isDebugValue()) + LIS->handleMove(*MI, true); + } + if (!MI->isDebugValue()) { + // Reset read - undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true, + /*IgnoreDead*/false); + // Adjust liveness and add missing dead+read-undef flags. 
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } + Top = std::next(MI->getIterator()); + } + RegionBegin = getMachineInstr(Schedule.front()); + + // Schedule consisting of MachineInstr* is considered 'detached' + // and already interleaved with debug values + if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) { + placeDebugValues(); + // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore + //assert(R.End == RegionEnd); + RegionEnd = R.End; + } + + R.Begin = RegionBegin; + R.MaxPressure = MaxRP; + +#ifndef NDEBUG + const auto RegionMaxRP = getRegionPressure(R); + const auto &ST = MF.getSubtarget<SISubtarget>(); +#endif + assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP)) + || (dbgs() << "Max RP mismatch!!!\n" + "RP for schedule (calculated): ", + SchedMaxRP.print(dbgs(), &ST), + dbgs() << "RP for schedule (reported): ", + MaxRP.print(dbgs(), &ST), + dbgs() << "RP after scheduling: ", + RegionMaxRP.print(dbgs(), &ST), + false)); +} + +// Sort recorded regions by pressure - highest at the front +void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + std::sort(Regions.begin(), Regions.end(), + [&ST, TargetOcc](const Region *R1, const Region *R2) { + return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); + }); +} + +/////////////////////////////////////////////////////////////////////////////// +// Legacy MaxOccupancy Strategy + +// Tries to increase occupancy applying minreg scheduler for a sequence of +// most demanding regions. Obtained schedules are saved as BestSchedule for a +// region. +// TargetOcc is the best achievable occupancy for a kernel. +// Returns better occupancy on success or current occupancy on fail. +// BestSchedules aren't deleted on fail. +unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { + // TODO: assert Regions are sorted descending by pressure + const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc + << ", current = " << Occ << '\n'); + + auto NewOcc = TargetOcc; + for (auto R : Regions) { + if (R->MaxPressure.getOccupancy(ST) >= NewOcc) + break; + + DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); + printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + const auto MaxRP = getSchedulePressure(*R, MinSchedule); + DEBUG(dbgs() << "Occupancy improvement attempt:\n"; + printSchedRP(dbgs(), R->MaxPressure, MaxRP)); + + NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); + if (NewOcc <= Occ) + break; + + setBestSchedule(*R, MinSchedule, MaxRP); + } + DEBUG(dbgs() << "New occupancy = " << NewOcc + << ", prev occupancy = " << Occ << '\n'); + return std::max(NewOcc, Occ); +} + +void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( + bool TryMaximizeOccupancy) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + + sortRegionsByPressure(TgtOcc); + auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + + if (TryMaximizeOccupancy && Occ < TgtOcc) + Occ = tryMaximizeOccupancy(TgtOcc); + + // This is really weird but for some magic scheduling regions twice + // gives performance improvement + const int NumPasses = Occ < TgtOcc ? 
2 : 1; + + TgtOcc = std::min(Occ, TgtOcc); + DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " << TgtOcc << '\n'); + GCNMaxOccupancySchedStrategy LStrgy(Context); + + for (int I = 0; I < NumPasses; ++I) { + // running first pass with TargetOccupancy = 0 mimics previous scheduling + // approach and is a performance magic + LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc); + for (auto R : Regions) { + OverrideLegacyStrategy Ovr(*R, LStrgy, *this); + + Ovr.schedule(); + const auto RP = getRegionPressure(*R); + DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + + if (RP.getOccupancy(ST) < TgtOcc) { + DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { + DEBUG(dbgs() << ", scheduling minimal register\n"); + scheduleBest(*R); + } else { + DEBUG(dbgs() << ", restoring\n"); + Ovr.restoreOrder(); + assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Minimal Register Strategy + +void GCNIterativeScheduler::scheduleMinReg(bool force) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + sortRegionsByPressure(TgtOcc); + + auto MaxPressure = Regions.front()->MaxPressure; + for (auto R : Regions) { + if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc)) + break; + + BuildDAG DAG(*R, *this); + const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); + + const auto RP = getSchedulePressure(*R, MinSchedule); + DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + dbgs() << "\nWarning: Pressure becomes worse after minreg!"; + printSchedRP(dbgs(), R->MaxPressure, RP); + }); + + if (!force && MaxPressure.less(ST, RP, TgtOcc)) + break; + + scheduleRegion(*R, MinSchedule, RP); + DEBUG(printSchedResult(dbgs(), R, RP)); + + MaxPressure = RP; + } +} diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h new file mode 100644 index 000000000000..df3afce21ebc --- /dev/null +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -0,0 +1,118 @@ +//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H + +#include "GCNRegPressure.h" + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class GCNIterativeScheduler : public ScheduleDAGMILive { + typedef ScheduleDAGMILive BaseClass; +public: + enum StrategyKind { + SCHEDULE_MINREGONLY, + SCHEDULE_MINREGFORCED, + SCHEDULE_LEGACYMAXOCCUPANCY + }; + + GCNIterativeScheduler(MachineSchedContext *C, + StrategyKind S); + + void schedule() override; + + void enterRegion(MachineBasicBlock *BB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned RegionInstrs) override; + + void finalizeSchedule() override; + +protected: + + typedef ArrayRef<const SUnit*> ScheduleRef; + + struct TentativeSchedule { + std::vector<MachineInstr*> Schedule; + GCNRegPressure MaxPressure; + }; + + struct Region { + // Fields except for BestSchedule are supposed to reflect current IR state + // `const` fields are to emphasize they shouldn't change for any schedule. + MachineBasicBlock::iterator Begin; + // End is either a boundary instruction or end of basic block + const MachineBasicBlock::iterator End; + const unsigned NumRegionInstrs; + GCNRegPressure MaxPressure; + + // best schedule for the region so far (not scheduled yet) + std::unique_ptr<TentativeSchedule> BestSchedule; + }; + + SpecificBumpPtrAllocator<Region> Alloc; + std::vector<Region*> Regions; + + MachineSchedContext *Context; + const StrategyKind Strategy; + mutable GCNUpwardRPTracker UPTracker; + + class BuildDAG; + class OverrideLegacyStrategy; + + template <typename Range> + GCNRegPressure getSchedulePressure(const Region &R, + Range &&Schedule) const; + + GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) const; + + GCNRegPressure getRegionPressure(const Region &R) const { + return getRegionPressure(R.Begin, R.End); + } + + void setBestSchedule(Region &R, + ScheduleRef Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + void scheduleBest(Region &R); + + std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const; + + void sortRegionsByPressure(unsigned TargetOcc); + + template <typename Range> + void scheduleRegion(Region &R, Range &&Schedule, + const GCNRegPressure &MaxRP = GCNRegPressure()); + + unsigned tryMaximizeOccupancy(unsigned TargetOcc = + std::numeric_limits<unsigned>::max()); + + void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true); + void scheduleMinReg(bool force = false); + + void printRegions(raw_ostream &OS) const; + void printSchedResult(raw_ostream &OS, + const Region *R, + const GCNRegPressure &RP) const; + void printSchedRP(raw_ostream &OS, + const GCNRegPressure &Before, + const GCNRegPressure &After) const; +}; + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp new file mode 100644 index 000000000000..c6d0f2179950 --- /dev/null +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -0,0 +1,266 @@ +//===----------------------- GCNMinRegStrategy.cpp - ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ScheduleDAG.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +class GCNMinRegScheduler { + struct Candidate : ilist_node<Candidate> { + const SUnit *SU; + int Priority; + + Candidate(const SUnit *SU_, int Priority_ = 0) + : SU(SU_), Priority(Priority_) {} + }; + + SpecificBumpPtrAllocator<Candidate> Alloc; + typedef simple_ilist<Candidate> Queue; + Queue RQ; // Ready queue + + std::vector<unsigned> NumPreds; + + bool isScheduled(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max(); + } + + void setIsScheduled(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max(); + } + + unsigned getNumPreds(const SUnit *SU) const { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max()); + return NumPreds[SU->NodeNum]; + } + + unsigned decNumPreds(const SUnit *SU) { + assert(!SU->isBoundaryNode()); + assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max()); + return --NumPreds[SU->NodeNum]; + } + + void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits); + + int getReadySuccessors(const SUnit *SU) const; + int getNotReadySuccessors(const SUnit *SU) const; + + template <typename Calc> + unsigned findMax(unsigned Num, Calc C); + + Candidate* pickCandidate(); + + void bumpPredsPriority(const SUnit *SchedSU, int Priority); + void releaseSuccessors(const SUnit* SU, int Priority); + +public: + std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG); +}; + +void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { + NumPreds.resize(SUnits.size()); + for (unsigned I = 0; I < SUnits.size(); ++I) + NumPreds[I] = SUnits[I].NumPredsLeft; +} + +int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const { + unsigned NumSchedSuccs = 0; + for (auto SDep : SU->Succs) { + bool wouldBeScheduled = true; + for (auto PDep : SDep.getSUnit()->Preds) { + auto PSU = PDep.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (PSU != SU && !isScheduled(PSU)) { + wouldBeScheduled = false; + break; + } + } + NumSchedSuccs += wouldBeScheduled ? 
1 : 0; + } + return NumSchedSuccs; +} + +int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const { + return SU->Succs.size() - getReadySuccessors(SU); +} + +template <typename Calc> +unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) { + assert(!RQ.empty() && Num <= RQ.size()); + typedef decltype(C(*RQ.begin())) T; + T Max = std::numeric_limits<T>::min(); + unsigned NumMax = 0; + for (auto I = RQ.begin(); Num; --Num) { + T Cur = C(*I); + if (Cur >= Max) { + if (Cur > Max) { + Max = Cur; + NumMax = 1; + } else + ++NumMax; + auto &Cand = *I++; + RQ.remove(Cand); + RQ.push_front(Cand); + continue; + } + ++I; + } + return NumMax; +} + +GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() { + do { + unsigned Num = RQ.size(); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return C.Priority; }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + int Res = getNotReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " + << Res << " successors, metric = " << -Res << '\n'); + return -Res; + }); + if (Num == 1) break; + + DEBUG(dbgs() << "\nSelecting most producing candidate among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { + auto SU = C.SU; + auto Res = getReadySuccessors(SU); + DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " + << Res << " successors, metric = " << Res << '\n'); + return Res; + }); + if (Num == 1) break; + + Num = Num ? Num : RQ.size(); + DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among " + << Num << '\n'); + Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; }); + assert(Num == 1); + } while (false); + + return &RQ.front(); +} + +void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { + SmallPtrSet<const SUnit*, 32> Set; + for (const auto &S : SchedSU->Succs) { + if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) || + S.getKind() != SDep::Data) + continue; + for (const auto &P : S.getSUnit()->Preds) { + auto PSU = P.getSUnit(); + assert(!PSU->isBoundaryNode()); + if (PSU != SchedSU && !isScheduled(PSU)) { + Set.insert(PSU); + } + } + } + SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end()); + while (!Worklist.empty()) { + auto SU = Worklist.pop_back_val(); + assert(!SU->isBoundaryNode()); + for (const auto &P : SU->Preds) { + if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) && + Set.insert(P.getSUnit()).second) + Worklist.push_back(P.getSUnit()); + } + } + DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum + << ")'s non-ready successors of " << Priority + << " priority in ready queue: "); + const auto SetEnd = Set.end(); + for (auto &C : RQ) { + if (Set.find(C.SU) != SetEnd) { + C.Priority = Priority; + DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); + } + } + DEBUG(dbgs() << '\n'); +} + +void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) { + for (const auto &S : SU->Succs) { + auto SuccSU = S.getSUnit(); + if (S.isWeak()) + continue; + assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0); + if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0) + RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority)); + } +} + +std::vector<const SUnit*> 
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG) { + const auto &SUnits = DAG.SUnits; + std::vector<const SUnit*> Schedule; + Schedule.reserve(SUnits.size()); + + initNumPreds(SUnits); + + int StepNo = 0; + + for (auto SU : TopRoots) { + RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo)); + } + releaseSuccessors(&DAG.EntrySU, StepNo); + + while (!RQ.empty()) { + DEBUG( + dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n" + "Ready queue:"; + for (auto &C : RQ) + dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; + dbgs() << '\n'; + ); + + auto C = pickCandidate(); + assert(C); + RQ.remove(*C); + auto SU = C->SU; + DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + + releaseSuccessors(SU, StepNo); + Schedule.push_back(SU); + setIsScheduled(SU); + + if (getReadySuccessors(SU) == 0) + bumpPredsPriority(SU, StepNo); + + ++StepNo; + } + assert(SUnits.size() == Schedule.size()); + + return Schedule; +} + +namespace llvm { +std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG) { + GCNMinRegScheduler S; + return S.schedule(TopRoots, DAG); +} +} diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp new file mode 100644 index 000000000000..4ecfa118fb27 --- /dev/null +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -0,0 +1,355 @@ +//===------------------------- GCNRegPressure.cpp - -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "GCNRegPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void llvm::printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + dbgs() << "Live regs at " << SI << ": " + << *LIS.getInstructionFromIndex(SI); + unsigned Num = 0; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + bool firstTime = true; + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) continue; + if (firstTime) { + dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo()) + << '\n'; + firstTime = false; + } + dbgs() << " " << S << '\n'; + ++Num; + } + } else if (LI.liveAt(SI)) { + dbgs() << " " << LI << '\n'; + ++Num; + } + } + if (!Num) dbgs() << " <none>\n"; +} + +static bool isEqual(const GCNRPTracker::LiveRegSet &S1, + const GCNRPTracker::LiveRegSet &S2) { + if (S1.size() != S2.size()) + return false; + + for (const auto &P : S1) { + auto I = S2.find(P.first); + if (I == S2.end() || I->second != P.second) + return false; + } + return true; +} + +static GCNRPTracker::LiveRegSet +stripEmpty(const GCNRPTracker::LiveRegSet &LR) { + GCNRPTracker::LiveRegSet Res; + for (const auto &P : LR) { + if (P.second.any()) + Res.insert(P); + } + return Res; +} +#endif + +/////////////////////////////////////////////////////////////////////////////// +// GCNRegPressure + +unsigned GCNRegPressure::getRegKind(unsigned Reg, + const MachineRegisterInfo &MRI) { + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + const auto RC 
= MRI.getRegClass(Reg); + auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + return STI->isSGPRClass(RC) ? + (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) : + (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE); +} + +void GCNRegPressure::inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI) { + if (NewMask == PrevMask) + return; + + int Sign = 1; + if (NewMask < PrevMask) { + std::swap(NewMask, PrevMask); + Sign = -1; + } +#ifndef NDEBUG + const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg); +#endif + switch (auto Kind = getRegKind(Reg, MRI)) { + case SGPR32: + case VGPR32: + assert(PrevMask.none() && NewMask == MaxMask); + Value[Kind] += Sign; + break; + + case SGPR_TUPLE: + case VGPR_TUPLE: + assert(NewMask < MaxMask || NewMask == MaxMask); + assert(PrevMask < NewMask); + + Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += + Sign * countPopulation((~PrevMask & NewMask).getAsInteger()); + + if (PrevMask.none()) { + assert(NewMask.any()); + Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight(); + } + break; + + default: llvm_unreachable("Unknown register kind"); + } +} + +bool GCNRegPressure::less(const SISubtarget &ST, + const GCNRegPressure& O, + unsigned MaxOccupancy) const { + const auto SGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(getSGRPNum())); + const auto VGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGRPNum())); + const auto OtherSGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumSGPRs(O.getSGRPNum())); + const auto OtherVGPROcc = std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(O.getVGRPNum())); + + const auto Occ = std::min(SGPROcc, VGPROcc); + const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); + if (Occ != OtherOcc) + return Occ > OtherOcc; + + bool SGPRImportant = SGPROcc < VGPROcc; + const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc; + + // if both pressures disagree on what is more important compare vgprs + if (SGPRImportant != OtherSGPRImportant) { + SGPRImportant = false; + } + + // compare large regs pressure + bool SGPRFirst = SGPRImportant; + for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) { + if (SGPRFirst) { + auto SW = getSGPRTuplesWeight(); + auto OtherSW = O.getSGPRTuplesWeight(); + if (SW != OtherSW) + return SW < OtherSW; + } else { + auto VW = getVGPRTuplesWeight(); + auto OtherVW = O.getVGPRTuplesWeight(); + if (VW != OtherVW) + return VW < OtherVW; + } + } + return SGPRImportant ? 
(getSGRPNum() < O.getSGRPNum()): + (getVGRPNum() < O.getVGRPNum()); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { + OS << "VGPRs: " << getVGRPNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')'; + OS << ", SGPRs: " << getSGRPNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')'; + OS << ", LVGPR WT: " << getVGPRTuplesWeight() + << ", LSGPR WT: " << getSGPRTuplesWeight(); + if (ST) OS << " -> Occ: " << getOccupancy(*ST); + OS << '\n'; +} +#endif + +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + +LaneBitmask llvm::getLiveLaneMask(unsigned Reg, + SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + assert(!MRI.reg_nodbg_empty(Reg)); + LaneBitmask LiveMask; + const auto &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) + if (S.liveAt(SI)) { + LiveMask |= S.LaneMask; + assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) || + LiveMask == MRI.getMaxLaneMaskForVReg(Reg)); + } + } else if (LI.liveAt(SI)) { + LiveMask = MRI.getMaxLaneMaskForVReg(Reg); + } + return LiveMask; +} + +GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + GCNRPTracker::LiveRegSet LiveRegs; + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); + if (LiveMask.any()) + LiveRegs[Reg] = LiveMask; + } + return LiveRegs; +} + +void GCNUpwardRPTracker::reset(const MachineInstr &MI) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + LiveRegs = getLiveRegsAfter(MI, LIS); + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + +LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { + assert(MO.isDef() && MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + // We don't rely on read-undef flag because in case of tentative schedule + // tracking it isn't set correctly yet. This works correctly however since + // use mask has been tracked before using LIS. + return MO.getSubReg() == 0 ? + MRI->getMaxLaneMaskForVReg(MO.getReg()) : + MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); +} + +LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { + assert(MO.isUse() && MO.isReg() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())); + + if (auto SubReg = MO.getSubReg()) + return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); + + auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg()); + if (MaxMask.getAsInteger() == 1) // cannot have subregs + return MaxMask; + + // For a tentative schedule LIS isn't updated yet but livemask should remain + // the same on any schedule. Subreg defs can be reordered but they all must + // dominate uses anyway. 
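// [Illustrative sketch, not part of the patch.] The tracker below keeps one
// lane mask per virtual register and charges pressure only for lanes whose
// liveness actually changes, so a partially live tuple costs fewer 32-bit
// units than a fully live one. A simplified STL-only model of that
// accounting, with a 64-bit integer standing in for LaneBitmask:
#include <bitset>
#include <cstdint>
#include <unordered_map>

struct LanePressure {
  std::unordered_map<unsigned, std::uint64_t> LiveLanes; // vreg -> live lane mask
  unsigned Units = 0;                                    // one unit per live lane

  void setLive(unsigned Reg, std::uint64_t Mask) {
    std::uint64_t &Cur = LiveLanes[Reg];
    Units += std::bitset<64>(Mask & ~Cur).count(); // charge only newly live lanes
    Cur |= Mask;
  }
  void clearLive(unsigned Reg, std::uint64_t Mask) {
    std::uint64_t &Cur = LiveLanes[Reg];
    Units -= std::bitset<64>(Mask & Cur).count();  // release only lanes that were live
    Cur &= ~Mask;
  }
};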
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex(); + return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI); +} + +void GCNUpwardRPTracker::recede(const MachineInstr &MI) { + assert(MRI && "call reset first"); + + LastTrackedMI = &MI; + + if (MI.isDebugValue()) + return; + + // process all defs first to ensure early clobbers are handled correctly + // iterating over operands() to catch implicit defs + for (const auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + + auto Reg = MO.getReg(); + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask &= ~getDefRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + // then all uses + for (const auto &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg() || + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; + + auto Reg = MO.getReg(); + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getUsedRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, + const GCNRPTracker::LiveRegSet &TrackedLR, + const TargetRegisterInfo *TRI) { + for (auto const &P : TrackedLR) { + auto I = LISLR.find(P.first); + if (I == LISLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in LIS reported set\n"; + } + else if (I->second != P.second) { + dbgs() << " " << PrintReg(P.first, TRI) + << " masks doesn't match: LIS reported " + << PrintLaneMask(I->second) + << ", tracked " + << PrintLaneMask(P.second) + << '\n'; + } + } + for (auto const &P : LISLR) { + auto I = TrackedLR.find(P.first); + if (I == TrackedLR.end()) { + dbgs() << " " << PrintReg(P.first, TRI) + << ":L" << PrintLaneMask(P.second) + << " isn't found in tracked set\n"; + } + } +} + +bool GCNUpwardRPTracker::isValid() const { + const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); + const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); + const auto TrackedLR = stripEmpty(LiveRegs); + + if (!isEqual(LISLR, TrackedLR)) { + dbgs() << "\nGCNUpwardRPTracker error: Tracked and" + " LIS reported livesets mismatch:\n"; + printLivesAt(SI, LIS, *MRI); + reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo()); + return false; + } + + auto LISPressure = getRegPressure(*MRI, LISLR); + if (LISPressure != CurPressure) { + dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: "; + CurPressure.print(dbgs()); + dbgs() << "LIS rpt: "; + LISPressure.print(dbgs()); + return false; + } + return true; +} + +#endif diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h new file mode 100644 index 000000000000..82e76a7bfddc --- /dev/null +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -0,0 +1,170 @@ +//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H +#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H + +#include "AMDGPUSubtarget.h" + +#include <limits> + +namespace llvm { + +struct GCNRegPressure { + enum RegKind { + SGPR32, + SGPR_TUPLE, + VGPR32, + VGPR_TUPLE, + TOTAL_KINDS + }; + + GCNRegPressure() { + clear(); + } + + bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; } + + void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + + unsigned getSGRPNum() const { return Value[SGPR32]; } + unsigned getVGRPNum() const { return Value[VGPR32]; } + + unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } + unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } + + unsigned getOccupancy(const SISubtarget &ST) const { + return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()), + ST.getOccupancyWithNumVGPRs(getVGRPNum())); + } + + void inc(unsigned Reg, + LaneBitmask PrevMask, + LaneBitmask NewMask, + const MachineRegisterInfo &MRI); + + bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const { + return getOccupancy(ST) > O.getOccupancy(ST); + } + + bool less(const SISubtarget &ST, const GCNRegPressure& O, + unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; + + bool operator==(const GCNRegPressure &O) const { + return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value); + } + + bool operator!=(const GCNRegPressure &O) const { + return !(*this == O); + } + + void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const; + void dump() const { print(dbgs()); } + +private: + unsigned Value[TOTAL_KINDS]; + + static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI); + + friend GCNRegPressure max(const GCNRegPressure &P1, + const GCNRegPressure &P2); +}; + +inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { + GCNRegPressure Res; + for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I) + Res.Value[I] = std::max(P1.Value[I], P2.Value[I]); + return Res; +} + +class GCNRPTracker { +public: + typedef DenseMap<unsigned, LaneBitmask> LiveRegSet; + +protected: + LiveRegSet LiveRegs; + GCNRegPressure CurPressure, MaxPressure; + const MachineInstr *LastTrackedMI = nullptr; + mutable const MachineRegisterInfo *MRI = nullptr; + GCNRPTracker() {} +public: + // live regs for the current state + const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } + const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } + + // returns MaxPressure, resetting it + decltype(MaxPressure) moveMaxPressure() { + auto Res = MaxPressure; + MaxPressure.clear(); + return Res; + } + decltype(LiveRegs) moveLiveRegs() { + return std::move(LiveRegs); + } +}; + +class GCNUpwardRPTracker : public GCNRPTracker { + const LiveIntervals &LIS; + LaneBitmask getDefRegMask(const MachineOperand &MO) const; + LaneBitmask getUsedRegMask(const MachineOperand &MO) const; +public: + GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + // reset tracker to the point just below MI + // filling live regs upon this point using LIS + void reset(const MachineInstr &MI); + + // move to the state just above the MI + void recede(const MachineInstr &MI); + + // checks whether the tracker's state after receding MI corresponds + // to reported by LIS + bool isValid() const; +}; + +LaneBitmask 
getLiveLaneMask(unsigned Reg, + SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI, + const LiveIntervals &LIS) { + return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS, + MI.getParent()->getParent()->getRegInfo()); +} + +inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, + const LiveIntervals &LIS) { + return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS, + MI.getParent()->getParent()->getRegInfo()); +} + +template <typename Range> +GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, + Range &&LiveRegs) { + GCNRegPressure Res; + for (const auto &RM : LiveRegs) + Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI); + return Res; +} + +void printLivesAt(SlotIndex SI, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + +} // End namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 2f88033c807f..ea305a92fc60 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -18,6 +18,7 @@ #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "misched" @@ -25,7 +26,7 @@ using namespace llvm; GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : - GenericScheduler(C) { } + GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { } static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, const MachineFunction &MF) { @@ -35,18 +36,46 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), ST.getOccupancyWithNumVGPRs(VGPRs)); return std::min(MinRegOccupancy, - ST.getOccupancyWithLocalMemSize(MFI->getLDSSize())); + ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), + *MF.getFunction())); +} + +void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { + GenericScheduler::initialize(DAG); + + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); + + MF = &DAG->MF; + + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + + // FIXME: This is also necessary, because some passes that run after + // scheduling and before regalloc increase register pressure. 
+ const int ErrorMargin = 3; + + SGPRExcessLimit = Context->RegClassInfo + ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin; + VGPRExcessLimit = Context->RegClassInfo + ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin; + if (TargetOccupancy) { + SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true); + VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy); + } else { + SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, + SRI->getSGPRPressureSet()); + VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, + SRI->getVGPRPressureSet()); + } + + SGPRCriticalLimit -= ErrorMargin; + VGPRCriticalLimit -= ErrorMargin; } void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, - int SGPRPressure, - int VGPRPressure, - int SGPRExcessLimit, - int VGPRExcessLimit, - int SGPRCriticalLimit, - int VGPRCriticalLimit) { + unsigned SGPRPressure, + unsigned VGPRPressure) { Cand.SU = SU; Cand.AtTop = AtTop; @@ -66,8 +95,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); } - int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; - int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; + unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; // If two instructions increase the pressure of different register sets // by the same amount, the generic scheduler will prefer to schedule the @@ -77,7 +106,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // only for VGPRs or only for SGPRs. // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. - const int MaxVGPRPressureInc = 16; + const unsigned MaxVGPRPressureInc = 16; bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; @@ -86,11 +115,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // to increase the likelihood we don't go over the limits. We should improve // the analysis to look through dependencies to find the path with the least // register pressure. - // FIXME: This is also necessary, because some passes that run after - // scheduling and before regalloc increase register pressure. - const int ErrorMargin = 3; - VGPRExcessLimit -= ErrorMargin; - SGPRExcessLimit -= ErrorMargin; // We only need to update the RPDelata for instructions that increase // register pressure. Instructions that decrease or keep reg pressure @@ -103,7 +127,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet()); - Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit); + Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } // Register pressure is considered 'CRITICAL' if it is approaching a value @@ -111,9 +135,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both // has the same cost, so we don't need to prefer one over the other. 
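
The hunk above moves the SGPR/VGPR excess and critical limit computation into initialize(), so it happens once when the strategy is set up for a region instead of on every candidate evaluation, and it keeps the three-register error margin that guards against pressure added by passes running between scheduling and register allocation. As a rough, self-contained illustration of the resulting policy (plain ints stand in for the LLVM pressure-set machinery, and all the numbers are invented):

#include <cstdio>

// Hypothetical numbers; the real values come from RegClassInfo and the
// subtarget occupancy tables.
struct Limits {
  int Excess;    // allocatable registers minus the error margin
  int Critical;  // registers allowed at the target occupancy, minus the margin
};

static Limits makeLimits(int Allocatable, int AtTargetOccupancy, int Margin) {
  return {Allocatable - Margin, AtTargetOccupancy - Margin};
}

// Mirrors the shape of the heuristic above: only pressure *increases* matter,
// checked first against the hard excess limit, then against the critical one.
static const char *classify(int NewPressure, const Limits &L) {
  if (NewPressure >= L.Excess)
    return "excess";
  if (NewPressure >= L.Critical)
    return "critical";
  return "ok";
}

int main() {
  const int ErrorMargin = 3;                  // same margin as in the patch
  Limits VGPR = makeLimits(/*Allocatable=*/256, /*AtTargetOccupancy=*/128,
                           ErrorMargin);
  std::printf("%s\n", classify(120, VGPR));   // ok
  std::printf("%s\n", classify(126, VGPR));   // critical (126 >= 125)
  std::printf("%s\n", classify(253, VGPR));   // excess (253 >= 253)
  return 0;
}
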
- VGPRCriticalLimit -= ErrorMargin; - SGPRCriticalLimit -= ErrorMargin; - int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; @@ -134,27 +155,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand) { - const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()]; unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()]; - unsigned SGPRExcessLimit = - Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); - unsigned VGPRExcessLimit = - Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); - unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF); - unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true); - unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves); - ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, - SGPRPressure, VGPRPressure, - SGPRExcessLimit, VGPRExcessLimit, - SGPRCriticalLimit, VGPRCriticalLimit); + SGPRPressure, VGPRPressure); // Pass SchedBoundary only when comparing nodes from the same boundary. SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg); @@ -167,16 +177,6 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, } } -static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) { - switch (Reason) { - default: - return Reason; - case GenericSchedulerBase::RegCritical: - case GenericSchedulerBase::RegExcess: - return -Reason; - } -} - // This function is mostly cut and pasted from // GenericScheduler::pickNodeBidirectional() SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { @@ -224,9 +224,9 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. 
DEBUG( dbgs() << "Top Cand: "; - traceCandidate(BotCand); - dbgs() << "Bot Cand: "; traceCandidate(TopCand); + dbgs() << "Bot Cand: "; + traceCandidate(BotCand); ); SchedCandidate Cand; if (TopCand.Reason == BotCand.Reason) { @@ -249,9 +249,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { Cand = BotCand; } else { - int TopRank = getBidirectionalReasonRank(TopCand.Reason); - int BotRank = getBidirectionalReasonRank(BotCand.Reason); - if (TopRank > BotRank) { + if (BotCand.Reason > TopCand.Reason) { Cand = TopCand; } else { Cand = BotCand; @@ -310,3 +308,255 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); return SU; } + +GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S) : + ScheduleDAGMILive(C, std::move(S)), + ST(MF.getSubtarget<SISubtarget>()), + MFI(*MF.getInfo<SIMachineFunctionInfo>()), + StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), + *MF.getFunction())), + MinOccupancy(StartingOccupancy), Stage(0) { + + DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); +} + +void GCNScheduleDAGMILive::schedule() { + std::vector<MachineInstr*> Unsched; + Unsched.reserve(NumRegionInstrs); + for (auto &I : *this) + Unsched.push_back(&I); + + std::pair<unsigned, unsigned> PressureBefore; + if (LIS) { + DEBUG(dbgs() << "Pressure before scheduling:\n"); + discoverLiveIns(); + PressureBefore = getRealRegPressure(); + } + + ScheduleDAGMILive::schedule(); + if (Stage == 0) + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + + if (!LIS) + return; + + // Check the results of scheduling. + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + DEBUG(dbgs() << "Pressure after scheduling:\n"); + auto PressureAfter = getRealRegPressure(); + LiveIns.clear(); + + if (PressureAfter.first <= S.SGPRCriticalLimit && + PressureAfter.second <= S.VGPRCriticalLimit) { + DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + return; + } + unsigned WavesAfter = getMaxWaves(PressureAfter.first, + PressureAfter.second, MF); + unsigned WavesBefore = getMaxWaves(PressureBefore.first, + PressureBefore.second, MF); + DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << + ", after " << WavesAfter << ".\n"); + + // We could not keep current target occupancy because of the just scheduled + // region. Record new occupancy for next scheduling cycle. + unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + if (NewOccupancy < MinOccupancy) { + MinOccupancy = NewOccupancy; + DEBUG(dbgs() << "Occupancy lowered for the function to " + << MinOccupancy << ".\n"); + } + + if (WavesAfter >= WavesBefore) + return; + + DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + RegionEnd = RegionBegin; + for (MachineInstr *MI : Unsched) { + if (MI->getIterator() != RegionEnd) { + BB->remove(MI); + BB->insert(RegionEnd, MI); + LIS->handleMove(*MI, true); + } + // Reset read-undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); + if (ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. 
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *LIS); + } + RegionEnd = MI->getIterator(); + ++RegionEnd; + DEBUG(dbgs() << "Scheduling " << *MI); + } + RegionBegin = Unsched.front()->getIterator(); + if (Stage == 0) + Regions.back() = std::make_pair(RegionBegin, RegionEnd); + + placeDebugValues(); +} + +static inline void setMask(const MachineRegisterInfo &MRI, + const SIRegisterInfo *SRI, unsigned Reg, + LaneBitmask &PrevMask, LaneBitmask NewMask, + unsigned &SGPRs, unsigned &VGPRs) { + int NewRegs = countPopulation(NewMask.getAsInteger()) - + countPopulation(PrevMask.getAsInteger()); + if (SRI->isSGPRReg(MRI, Reg)) + SGPRs += NewRegs; + if (SRI->isVGPR(MRI, Reg)) + VGPRs += NewRegs; + assert ((int)SGPRs >= 0 && (int)VGPRs >= 0); + PrevMask = NewMask; +} + +void GCNScheduleDAGMILive::discoverLiveIns() { + unsigned SGPRs = 0; + unsigned VGPRs = 0; + + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); + SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex(); + assert (SI.isValid()); + + DEBUG(dbgs() << "Region live-ins:"); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) + continue; + const LiveInterval &LI = LIS->getInterval(Reg); + LaneBitmask LaneMask = LaneBitmask::getNone(); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) + if (S.liveAt(SI)) + LaneMask |= S.LaneMask; + } else if (LI.liveAt(SI)) { + LaneMask = MRI.getMaxLaneMaskForVReg(Reg); + } + + if (LaneMask.any()) { + setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs); + + DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':' + << PrintLaneMask(LiveIns[Reg])); + } + } + + LiveInPressure = std::make_pair(SGPRs, VGPRs); + + DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs + << "\nVGPR = " << VGPRs << '\n'); +} + +std::pair<unsigned, unsigned> +GCNScheduleDAGMILive::getRealRegPressure() const { + unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs; + SGPRs = MaxSGPRs = LiveInPressure.first; + VGPRs = MaxVGPRs = LiveInPressure.second; + + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); + DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns); + + for (const MachineInstr &MI : *this) { + if (MI.isDebugValue()) + continue; + SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); + assert (SI.isValid()); + + // Remove dead registers or mask bits. + for (auto &It : LiveRegs) { + if (It.second.none()) + continue; + const LiveInterval &LI = LIS->getInterval(It.first); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) + if (!S.liveAt(SI)) + setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask, + SGPRs, VGPRs); + } else if (!LI.liveAt(SI)) { + setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(), + SGPRs, VGPRs); + } + } + + // Add new registers or mask bits. + for (const auto &MO : MI.defs()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask LaneMask = SubRegIdx != 0 + ? 
TRI->getSubRegIndexLaneMask(SubRegIdx) + : MRI.getMaxLaneMaskForVReg(Reg); + LaneBitmask &LM = LiveRegs[Reg]; + setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs); + } + MaxSGPRs = std::max(MaxSGPRs, SGPRs); + MaxVGPRs = std::max(MaxVGPRs, VGPRs); + } + + DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs + << "\nVGPR = " << MaxVGPRs << '\n'); + + return std::make_pair(MaxSGPRs, MaxVGPRs); +} + +void GCNScheduleDAGMILive::finalizeSchedule() { + // Retry function scheduling if we found resulting occupancy and it is + // lower than used for first pass scheduling. This will give more freedom + // to schedule low register pressure blocks. + // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + + if (!LIS || StartingOccupancy <= MinOccupancy) + return; + + DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); + + Stage++; + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + S.setTargetOccupancy(MinOccupancy); + + MachineBasicBlock *MBB = nullptr; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; + + if (RegionBegin->getParent() != MBB) { + if (MBB) finishBlock(); + MBB = RegionBegin->getParent(); + startBlock(MBB); + } + + unsigned NumRegionInstrs = std::distance(begin(), end()); + enterRegion(MBB, begin(), end(), NumRegionInstrs); + + // Skip empty scheduling regions (0 or 1 schedulable instructions). + if (begin() == end() || begin() == std::prev(end())) { + exitRegion(); + continue; + } + DEBUG(dbgs() << "********** MI Scheduling **********\n"); + DEBUG(dbgs() << MF.getName() + << ":BB#" << MBB->getNumber() << " " << MBB->getName() + << "\n From: " << *begin() << " To: "; + if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + + schedule(); + + exitRegion(); + } + finishBlock(); + LiveIns.shrink_and_clear(); +} diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 4cfc0cea81fb..15af232704ff 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -18,13 +18,16 @@ namespace llvm { +class SIMachineFunctionInfo; class SIRegisterInfo; +class SISubtarget; /// This is a minimal scheduler strategy. The main difference between this /// and the GenericScheduler is that GCNSchedStrategy uses different /// heuristics to determine excess/critical pressure sets. Its goal is to /// maximize kernel occupancy (i.e. maximum number of waves per simd). 
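
finalizeSchedule() above turns region scheduling into a two-stage process: the first pass records every region and the lowest occupancy it was forced to accept, and a second pass runs only if that occupancy dropped below the starting target, this time with the relaxed target installed in the strategy. A toy, self-contained sketch of that control flow (the Region type and the occupancy numbers are stand-ins, not the ScheduleDAGMILive API):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Region { int Id; };

// Pretend that scheduling a region reports the occupancy it can sustain.
// Made-up behaviour: region 2 cannot reach the initial target.
static unsigned scheduleRegion(const Region &R, unsigned Target) {
  return R.Id == 2 ? std::min(Target, 6u) : Target;
}

int main() {
  std::vector<Region> Regions = {{1}, {2}, {3}};
  unsigned StartingOccupancy = 8, MinOccupancy = StartingOccupancy;

  // Stage 0: schedule everything, remembering the worst achieved occupancy.
  for (const Region &R : Regions)
    MinOccupancy = std::min(MinOccupancy, scheduleRegion(R, StartingOccupancy));

  // Stage 1: only if some region lowered the occupancy, reschedule all regions
  // against the easier target to regain scheduling freedom elsewhere.
  if (MinOccupancy < StartingOccupancy) {
    std::printf("retrying with occupancy %u\n", MinOccupancy);
    for (const Region &R : Regions)
      scheduleRegion(R, MinOccupancy);
  }
  return 0;
}
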
class GCNMaxOccupancySchedStrategy : public GenericScheduler { + friend class GCNScheduleDAGMILive; SUnit *pickNodeBidirectional(bool &IsTopNode); @@ -35,18 +38,65 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler { void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, - int SGPRPressure, int VGPRPressure, - int SGPRExcessLimit, int VGPRExcessLimit, - int SGPRCriticalLimit, int VGPRCriticalLimit); + unsigned SGPRPressure, unsigned VGPRPressure); - void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone, const SIRegisterInfo *SRI, - unsigned SGPRPressure, unsigned VGPRPressure); + unsigned SGPRExcessLimit; + unsigned VGPRExcessLimit; + unsigned SGPRCriticalLimit; + unsigned VGPRCriticalLimit; + + unsigned TargetOccupancy; + + MachineFunction *MF; public: GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); SUnit *pickNode(bool &IsTopNode) override; + + void initialize(ScheduleDAGMI *DAG) override; + + void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } +}; + +class GCNScheduleDAGMILive : public ScheduleDAGMILive { + + const SISubtarget &ST; + + const SIMachineFunctionInfo &MFI; + + // Occupancy target at the begining of function scheduling cycle. + unsigned StartingOccupancy; + + // Minimal real occupancy recorder for the function. + unsigned MinOccupancy; + + // Scheduling stage number. + unsigned Stage; + + // Vecor of regions recorder for later rescheduling + SmallVector<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>, 32> Regions; + + // Region live-ins. + DenseMap<unsigned, LaneBitmask> LiveIns; + + // Number of live-ins to the current region, first SGPR then VGPR. + std::pair<unsigned, unsigned> LiveInPressure; + + // Collect current region live-ins. + void discoverLiveIns(); + + // Return current region pressure. First value is SGPR number, second is VGPR. + std::pair<unsigned, unsigned> getRealRegPressure() const; + +public: + GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S); + + void schedule() override; + + void finalizeSchedule() override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index 7172a0aa7167..a817ff3cbaf0 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -113,7 +113,7 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { - O << " offset:"; + O << ((OpNo == 0)? 
"offset:" : " offset:"); printU16ImmDecOperand(MI, OpNo, O); } } @@ -375,6 +375,14 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, O << formatHex(static_cast<uint64_t>(Imm)); } +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Lo16 = static_cast<uint16_t>(Imm); + assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); + printImmediate16(Lo16, STI, O); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -489,6 +497,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_FP16: printImmediate16(Op.getImm(), STI, O); break; + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + printImmediateV216(Op.getImm(), STI, O); + break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: O << formatDec(Op.getImm()); @@ -531,13 +543,34 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & SISrcMods::NEG) - O << '-'; + + // Use 'neg(...)' instead of '-' to avoid ambiguity. + // This is important for integer literals because + // -1 is not the same value as neg(1). + bool NegMnemo = false; + + if (InputModifiers & SISrcMods::NEG) { + if (OpNo + 1 < MI->getNumOperands() && + (InputModifiers & SISrcMods::ABS) == 0) { + const MCOperand &Op = MI->getOperand(OpNo + 1); + NegMnemo = Op.isImm() || Op.isFPImm(); + } + if (NegMnemo) { + O << "neg("; + } else { + O << '-'; + } + } + if (InputModifiers & SISrcMods::ABS) O << '|'; printOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::ABS) O << '|'; + + if (NegMnemo) { + O << ')'; + } } void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, @@ -672,11 +705,19 @@ template <unsigned N> void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en); + unsigned Opc = MI->getOpcode(); + int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en); unsigned En = MI->getOperand(EnIdx).getImm(); - // FIXME: What do we do with compr? The meaning of en changes depending on if - // compr is set. 
+ int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr); + + // If compr is set, print as src0, src0, src1, src1 + if (MI->getOperand(ComprIdx).getImm()) { + if (N == 1 || N == 2) + --OpNo; + else if (N == 3) + OpNo -= 2; + } if (En & (1 << N)) printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI); @@ -730,6 +771,71 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo, } } +static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) { + int DefaultValue = (Mod == SISrcMods::OP_SEL_1); + + for (int I = 0; I < NumOps; ++I) { + if (!!(Ops[I] & Mod) != DefaultValue) + return false; + } + + return true; +} + +static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, + raw_ostream &O) { + unsigned Opc = MI->getOpcode(); + int NumOps = 0; + int Ops[3]; + + for (int OpName : { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx == -1) + break; + + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + } + + if (allOpsDefaultValue(Ops, NumOps, Mod)) + return; + + O << Name; + for (int I = 0; I < NumOps; ++I) { + if (I != 0) + O << ','; + + O << !!(Ops[I] & Mod); + } + + O << ']'; +} + +void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O); +} + +void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O); +} + +void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O); +} + +void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1057,27 +1163,28 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - IsaVersion IV = getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaInfo::IsaVersion ISA = + AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits()); unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt, Expcnt, Lgkmcnt; - decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt); + decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt); bool NeedSpace = false; - if (Vmcnt != getVmcntBitMask(IV)) { + if (Vmcnt != getVmcntBitMask(ISA)) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != getExpcntBitMask(IV)) { + if (Expcnt != getExpcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != getLgkmcntBitMask(IV)) { + if (Lgkmcnt != getLgkmcntBitMask(ISA)) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index a6d348ff0f12..c0b8e5c51089 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -90,6 +90,8 @@ private: raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); + void 
printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, @@ -117,6 +119,14 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSDWADstUnused(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printOpSelHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegLo(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNegHi(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt index bbdd17737cf0..c54a13c4b4d8 100644 --- a/lib/Target/AMDGPU/LLVMBuild.txt +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -30,5 +30,5 @@ has_disassembler = 1 type = Library name = AMDGPUCodeGen parent = AMDGPU -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index ffb92aae599e..f3266fe82955 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -37,7 +37,7 @@ public: bool &IsResolved) override; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -131,7 +131,7 @@ void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, - bool IsPCRel) const { + bool IsPCRel, MCContext &Ctx) const { if (!Value) return; // Doesn't change encoding. @@ -164,7 +164,20 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( } bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - OW->WriteZeros(Count); + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + OW->WriteZeros(Count % 4); + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + + // FIXME: R600 support. 
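
writeNopData() above pads any unaligned tail with zero bytes and fills the rest with whole s_nop instructions: a 10-byte gap becomes 10 % 4 = 2 zero bytes followed by 10 / 4 = 2 four-byte NOPs. A self-contained sketch of that split (the encoding constant is the one used in the patch; the helper itself is illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> padWords(uint64_t Count, unsigned &ZeroBytes) {
  ZeroBytes = static_cast<unsigned>(Count % 4);   // unaligned tail, written as zeros
  const uint32_t EncodedSNop0 = 0xbf800000u;      // s_nop 0, as in the patch
  return std::vector<uint32_t>(Count / 4, EncodedSNop0);
}

int main() {
  unsigned Zeros = 0;
  auto Nops = padWords(10, Zeros);
  assert(Zeros == 2 && Nops.size() == 2);         // 2 + 2 * 4 = 10 bytes total
  return 0;
}
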
+ // s_nop 0 + const uint32_t Encoded_S_NOP_0 = 0xbf800000; + + for (uint64_t I = 0; I != Count; ++I) + OW->write32(Encoded_S_NOP_0); return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h new file mode 100644 index 000000000000..816e8c744b27 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h @@ -0,0 +1,422 @@ +//===--- AMDGPUCodeObjectMetadata.h -----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Code Object Metadata definitions and in-memory +/// representations. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H + +#include <cstdint> +#include <string> +#include <system_error> +#include <vector> + +namespace llvm { +namespace AMDGPU { + +//===----------------------------------------------------------------------===// +// Code Object Metadata. +//===----------------------------------------------------------------------===// +namespace CodeObject { + +/// \brief Code object metadata major version. +constexpr uint32_t MetadataVersionMajor = 1; +/// \brief Code object metadata minor version. +constexpr uint32_t MetadataVersionMinor = 0; + +/// \brief Code object metadata beginning assembler directive. +constexpr char MetadataAssemblerDirectiveBegin[] = + ".amdgpu_code_object_metadata"; +/// \brief Code object metadata ending assembler directive. +constexpr char MetadataAssemblerDirectiveEnd[] = + ".end_amdgpu_code_object_metadata"; + +/// \brief Access qualifiers. +enum class AccessQualifier : uint8_t { + Default = 0, + ReadOnly = 1, + WriteOnly = 2, + ReadWrite = 3, + Unknown = 0xff +}; + +/// \brief Address space qualifiers. +enum class AddressSpaceQualifier : uint8_t { + Private = 0, + Global = 1, + Constant = 2, + Local = 3, + Generic = 4, + Region = 5, + Unknown = 0xff +}; + +/// \brief Value kinds. +enum class ValueKind : uint8_t { + ByValue = 0, + GlobalBuffer = 1, + DynamicSharedPointer = 2, + Sampler = 3, + Image = 4, + Pipe = 5, + Queue = 6, + HiddenGlobalOffsetX = 7, + HiddenGlobalOffsetY = 8, + HiddenGlobalOffsetZ = 9, + HiddenNone = 10, + HiddenPrintfBuffer = 11, + HiddenDefaultQueue = 12, + HiddenCompletionAction = 13, + Unknown = 0xff +}; + +/// \brief Value types. +enum class ValueType : uint8_t { + Struct = 0, + I8 = 1, + U8 = 2, + I16 = 3, + U16 = 4, + F16 = 5, + I32 = 6, + U32 = 7, + F32 = 8, + I64 = 9, + U64 = 10, + F64 = 11, + Unknown = 0xff +}; + +//===----------------------------------------------------------------------===// +// Kernel Metadata. +//===----------------------------------------------------------------------===// +namespace Kernel { + +//===----------------------------------------------------------------------===// +// Kernel Attributes Metadata. +//===----------------------------------------------------------------------===// +namespace Attrs { + +namespace Key { +/// \brief Key for Kernel::Attr::Metadata::mReqdWorkGroupSize. +constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize"; +/// \brief Key for Kernel::Attr::Metadata::mWorkGroupSizeHint. 
+constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint"; +/// \brief Key for Kernel::Attr::Metadata::mVecTypeHint. +constexpr char VecTypeHint[] = "VecTypeHint"; +} // end namespace Key + +/// \brief In-memory representation of kernel attributes metadata. +struct Metadata final { + /// \brief 'reqd_work_group_size' attribute. Optional. + std::vector<uint32_t> mReqdWorkGroupSize = std::vector<uint32_t>(); + /// \brief 'work_group_size_hint' attribute. Optional. + std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>(); + /// \brief 'vec_type_hint' attribute. Optional. + std::string mVecTypeHint = std::string(); + + /// \brief Default constructor. + Metadata() = default; + + /// \returns True if kernel attributes metadata is empty, false otherwise. + bool empty() const { + return mReqdWorkGroupSize.empty() && + mWorkGroupSizeHint.empty() && + mVecTypeHint.empty(); + } + + /// \returns True if kernel attributes metadata is not empty, false otherwise. + bool notEmpty() const { + return !empty(); + } +}; + +} // end namespace Attrs + +//===----------------------------------------------------------------------===// +// Kernel Argument Metadata. +//===----------------------------------------------------------------------===// +namespace Arg { + +namespace Key { +/// \brief Key for Kernel::Arg::Metadata::mSize. +constexpr char Size[] = "Size"; +/// \brief Key for Kernel::Arg::Metadata::mAlign. +constexpr char Align[] = "Align"; +/// \brief Key for Kernel::Arg::Metadata::mValueKind. +constexpr char ValueKind[] = "ValueKind"; +/// \brief Key for Kernel::Arg::Metadata::mValueType. +constexpr char ValueType[] = "ValueType"; +/// \brief Key for Kernel::Arg::Metadata::mPointeeAlign. +constexpr char PointeeAlign[] = "PointeeAlign"; +/// \brief Key for Kernel::Arg::Metadata::mAccQual. +constexpr char AccQual[] = "AccQual"; +/// \brief Key for Kernel::Arg::Metadata::mAddrSpaceQual. +constexpr char AddrSpaceQual[] = "AddrSpaceQual"; +/// \brief Key for Kernel::Arg::Metadata::mIsConst. +constexpr char IsConst[] = "IsConst"; +/// \brief Key for Kernel::Arg::Metadata::mIsPipe. +constexpr char IsPipe[] = "IsPipe"; +/// \brief Key for Kernel::Arg::Metadata::mIsRestrict. +constexpr char IsRestrict[] = "IsRestrict"; +/// \brief Key for Kernel::Arg::Metadata::mIsVolatile. +constexpr char IsVolatile[] = "IsVolatile"; +/// \brief Key for Kernel::Arg::Metadata::mName. +constexpr char Name[] = "Name"; +/// \brief Key for Kernel::Arg::Metadata::mTypeName. +constexpr char TypeName[] = "TypeName"; +} // end namespace Key + +/// \brief In-memory representation of kernel argument metadata. +struct Metadata final { + /// \brief Size in bytes. Required. + uint32_t mSize = 0; + /// \brief Alignment in bytes. Required. + uint32_t mAlign = 0; + /// \brief Value kind. Required. + ValueKind mValueKind = ValueKind::Unknown; + /// \brief Value type. Required. + ValueType mValueType = ValueType::Unknown; + /// \brief Pointee alignment in bytes. Optional. + uint32_t mPointeeAlign = 0; + /// \brief Access qualifier. Optional. + AccessQualifier mAccQual = AccessQualifier::Unknown; + /// \brief Address space qualifier. Optional. + AddressSpaceQualifier mAddrSpaceQual = AddressSpaceQualifier::Unknown; + /// \brief True if 'const' qualifier is specified. Optional. + bool mIsConst = false; + /// \brief True if 'pipe' qualifier is specified. Optional. + bool mIsPipe = false; + /// \brief True if 'restrict' qualifier is specified. Optional. + bool mIsRestrict = false; + /// \brief True if 'volatile' qualifier is specified. 
Optional. + bool mIsVolatile = false; + /// \brief Name. Optional. + std::string mName = std::string(); + /// \brief Type name. Optional. + std::string mTypeName = std::string(); + + /// \brief Default constructor. + Metadata() = default; +}; + +} // end namespace Arg + +//===----------------------------------------------------------------------===// +// Kernel Code Properties Metadata. +//===----------------------------------------------------------------------===// +namespace CodeProps { + +namespace Key { +/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentSize. +constexpr char KernargSegmentSize[] = "KernargSegmentSize"; +/// \brief Key for Kernel::CodeProps::Metadata::mWorkgroupGroupSegmentSize. +constexpr char WorkgroupGroupSegmentSize[] = "WorkgroupGroupSegmentSize"; +/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemPrivateSegmentSize. +constexpr char WorkitemPrivateSegmentSize[] = "WorkitemPrivateSegmentSize"; +/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontNumSGPRs. +constexpr char WavefrontNumSGPRs[] = "WavefrontNumSGPRs"; +/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemNumVGPRs. +constexpr char WorkitemNumVGPRs[] = "WorkitemNumVGPRs"; +/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentAlign. +constexpr char KernargSegmentAlign[] = "KernargSegmentAlign"; +/// \brief Key for Kernel::CodeProps::Metadata::mGroupSegmentAlign. +constexpr char GroupSegmentAlign[] = "GroupSegmentAlign"; +/// \brief Key for Kernel::CodeProps::Metadata::mPrivateSegmentAlign. +constexpr char PrivateSegmentAlign[] = "PrivateSegmentAlign"; +/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontSize. +constexpr char WavefrontSize[] = "WavefrontSize"; +} // end namespace Key + +/// \brief In-memory representation of kernel code properties metadata. +struct Metadata final { + /// \brief Size in bytes of the kernarg segment memory. Kernarg segment memory + /// holds the values of the arguments to the kernel. Optional. + uint64_t mKernargSegmentSize = 0; + /// \brief Size in bytes of the group segment memory required by a workgroup. + /// This value does not include any dynamically allocated group segment memory + /// that may be added when the kernel is dispatched. Optional. + uint32_t mWorkgroupGroupSegmentSize = 0; + /// \brief Size in bytes of the private segment memory required by a workitem. + /// Private segment memory includes arg, spill and private segments. Optional. + uint32_t mWorkitemPrivateSegmentSize = 0; + /// \brief Total number of SGPRs used by a wavefront. Optional. + uint16_t mWavefrontNumSGPRs = 0; + /// \brief Total number of VGPRs used by a workitem. Optional. + uint16_t mWorkitemNumVGPRs = 0; + /// \brief Maximum byte alignment of variables used by the kernel in the + /// kernarg memory segment. Expressed as a power of two. Optional. + uint8_t mKernargSegmentAlign = 0; + /// \brief Maximum byte alignment of variables used by the kernel in the + /// group memory segment. Expressed as a power of two. Optional. + uint8_t mGroupSegmentAlign = 0; + /// \brief Maximum byte alignment of variables used by the kernel in the + /// private memory segment. Expressed as a power of two. Optional. + uint8_t mPrivateSegmentAlign = 0; + /// \brief Wavefront size. Expressed as a power of two. Optional. + uint8_t mWavefrontSize = 0; + + /// \brief Default constructor. + Metadata() = default; + + /// \returns True if kernel code properties metadata is empty, false + /// otherwise. 
+ bool empty() const { + return !notEmpty(); + } + + /// \returns True if kernel code properties metadata is not empty, false + /// otherwise. + bool notEmpty() const { + return mKernargSegmentSize || mWorkgroupGroupSegmentSize || + mWorkitemPrivateSegmentSize || mWavefrontNumSGPRs || + mWorkitemNumVGPRs || mKernargSegmentAlign || mGroupSegmentAlign || + mPrivateSegmentAlign || mWavefrontSize; + } +}; + +} // end namespace CodeProps + +//===----------------------------------------------------------------------===// +// Kernel Debug Properties Metadata. +//===----------------------------------------------------------------------===// +namespace DebugProps { + +namespace Key { +/// \brief Key for Kernel::DebugProps::Metadata::mDebuggerABIVersion. +constexpr char DebuggerABIVersion[] = "DebuggerABIVersion"; +/// \brief Key for Kernel::DebugProps::Metadata::mReservedNumVGPRs. +constexpr char ReservedNumVGPRs[] = "ReservedNumVGPRs"; +/// \brief Key for Kernel::DebugProps::Metadata::mReservedFirstVGPR. +constexpr char ReservedFirstVGPR[] = "ReservedFirstVGPR"; +/// \brief Key for Kernel::DebugProps::Metadata::mPrivateSegmentBufferSGPR. +constexpr char PrivateSegmentBufferSGPR[] = "PrivateSegmentBufferSGPR"; +/// \brief Key for +/// Kernel::DebugProps::Metadata::mWavefrontPrivateSegmentOffsetSGPR. +constexpr char WavefrontPrivateSegmentOffsetSGPR[] = + "WavefrontPrivateSegmentOffsetSGPR"; +} // end namespace Key + +/// \brief In-memory representation of kernel debug properties metadata. +struct Metadata final { + /// \brief Debugger ABI version. Optional. + std::vector<uint32_t> mDebuggerABIVersion = std::vector<uint32_t>(); + /// \brief Consecutive number of VGPRs reserved for debugger use. Must be 0 if + /// mDebuggerABIVersion is not set. Optional. + uint16_t mReservedNumVGPRs = 0; + /// \brief First fixed VGPR reserved. Must be uint16_t(-1) if + /// mDebuggerABIVersion is not set or mReservedFirstVGPR is 0. Optional. + uint16_t mReservedFirstVGPR = uint16_t(-1); + /// \brief Fixed SGPR of the first of 4 SGPRs used to hold the scratch V# used + /// for the entire kernel execution. Must be uint16_t(-1) if + /// mDebuggerABIVersion is not set or SGPR not used or not known. Optional. + uint16_t mPrivateSegmentBufferSGPR = uint16_t(-1); + /// \brief Fixed SGPR used to hold the wave scratch offset for the entire + /// kernel execution. Must be uint16_t(-1) if mDebuggerABIVersion is not set + /// or SGPR is not used or not known. Optional. + uint16_t mWavefrontPrivateSegmentOffsetSGPR = uint16_t(-1); + + /// \brief Default constructor. + Metadata() = default; + + /// \returns True if kernel debug properties metadata is empty, false + /// otherwise. + bool empty() const { + return !notEmpty(); + } + + /// \returns True if kernel debug properties metadata is not empty, false + /// otherwise. + bool notEmpty() const { + return !mDebuggerABIVersion.empty(); + } +}; + +} // end namespace DebugProps + +namespace Key { +/// \brief Key for Kernel::Metadata::mName. +constexpr char Name[] = "Name"; +/// \brief Key for Kernel::Metadata::mLanguage. +constexpr char Language[] = "Language"; +/// \brief Key for Kernel::Metadata::mLanguageVersion. +constexpr char LanguageVersion[] = "LanguageVersion"; +/// \brief Key for Kernel::Metadata::mAttrs. +constexpr char Attrs[] = "Attrs"; +/// \brief Key for Kernel::Metadata::mArgs. +constexpr char Args[] = "Args"; +/// \brief Key for Kernel::Metadata::mCodeProps. +constexpr char CodeProps[] = "CodeProps"; +/// \brief Key for Kernel::Metadata::mDebugProps. 
+constexpr char DebugProps[] = "DebugProps"; +} // end namespace Key + +/// \brief In-memory representation of kernel metadata. +struct Metadata final { + /// \brief Name. Required. + std::string mName = std::string(); + /// \brief Language. Optional. + std::string mLanguage = std::string(); + /// \brief Language version. Optional. + std::vector<uint32_t> mLanguageVersion = std::vector<uint32_t>(); + /// \brief Attributes metadata. Optional. + Attrs::Metadata mAttrs = Attrs::Metadata(); + /// \brief Arguments metadata. Optional. + std::vector<Arg::Metadata> mArgs = std::vector<Arg::Metadata>(); + /// \brief Code properties metadata. Optional. + CodeProps::Metadata mCodeProps = CodeProps::Metadata(); + /// \brief Debug properties metadata. Optional. + DebugProps::Metadata mDebugProps = DebugProps::Metadata(); + + /// \brief Default constructor. + Metadata() = default; +}; + +} // end namespace Kernel + +namespace Key { +/// \brief Key for CodeObject::Metadata::mVersion. +constexpr char Version[] = "Version"; +/// \brief Key for CodeObject::Metadata::mPrintf. +constexpr char Printf[] = "Printf"; +/// \brief Key for CodeObject::Metadata::mKernels. +constexpr char Kernels[] = "Kernels"; +} // end namespace Key + +/// \brief In-memory representation of code object metadata. +struct Metadata final { + /// \brief Code object metadata version. Required. + std::vector<uint32_t> mVersion = std::vector<uint32_t>(); + /// \brief Printf metadata. Optional. + std::vector<std::string> mPrintf = std::vector<std::string>(); + /// \brief Kernels metadata. Optional. + std::vector<Kernel::Metadata> mKernels = std::vector<Kernel::Metadata>(); + + /// \brief Default constructor. + Metadata() = default; + + /// \brief Converts \p YamlString to \p CodeObjectMetadata. + static std::error_code fromYamlString(std::string YamlString, + Metadata &CodeObjectMetadata); + + /// \brief Converts \p CodeObjectMetadata to \p YamlString. + static std::error_code toYamlString(Metadata CodeObjectMetadata, + std::string &YamlString); +}; + +} // end namespace CodeObject +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp new file mode 100644 index 000000000000..29a6ab9fbe93 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -0,0 +1,625 @@ +//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Code Object Metadata Streamer. 
+/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUCodeObjectMetadataStreamer.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/YAMLTraits.h" + +using namespace llvm::AMDGPU; +using namespace llvm::AMDGPU::CodeObject; + +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) +LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata) +LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) + +namespace llvm { + +static cl::opt<bool> DumpCodeObjectMetadata( + "amdgpu-dump-comd", + cl::desc("Dump AMDGPU Code Object Metadata")); +static cl::opt<bool> VerifyCodeObjectMetadata( + "amdgpu-verify-comd", + cl::desc("Verify AMDGPU Code Object Metadata")); + +namespace yaml { + +template <> +struct ScalarEnumerationTraits<AccessQualifier> { + static void enumeration(IO &YIO, AccessQualifier &EN) { + YIO.enumCase(EN, "Default", AccessQualifier::Default); + YIO.enumCase(EN, "ReadOnly", AccessQualifier::ReadOnly); + YIO.enumCase(EN, "WriteOnly", AccessQualifier::WriteOnly); + YIO.enumCase(EN, "ReadWrite", AccessQualifier::ReadWrite); + } +}; + +template <> +struct ScalarEnumerationTraits<AddressSpaceQualifier> { + static void enumeration(IO &YIO, AddressSpaceQualifier &EN) { + YIO.enumCase(EN, "Private", AddressSpaceQualifier::Private); + YIO.enumCase(EN, "Global", AddressSpaceQualifier::Global); + YIO.enumCase(EN, "Constant", AddressSpaceQualifier::Constant); + YIO.enumCase(EN, "Local", AddressSpaceQualifier::Local); + YIO.enumCase(EN, "Generic", AddressSpaceQualifier::Generic); + YIO.enumCase(EN, "Region", AddressSpaceQualifier::Region); + } +}; + +template <> +struct ScalarEnumerationTraits<ValueKind> { + static void enumeration(IO &YIO, ValueKind &EN) { + YIO.enumCase(EN, "ByValue", ValueKind::ByValue); + YIO.enumCase(EN, "GlobalBuffer", ValueKind::GlobalBuffer); + YIO.enumCase(EN, "DynamicSharedPointer", ValueKind::DynamicSharedPointer); + YIO.enumCase(EN, "Sampler", ValueKind::Sampler); + YIO.enumCase(EN, "Image", ValueKind::Image); + YIO.enumCase(EN, "Pipe", ValueKind::Pipe); + YIO.enumCase(EN, "Queue", ValueKind::Queue); + YIO.enumCase(EN, "HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX); + YIO.enumCase(EN, "HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY); + YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ); + YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone); + YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer); + YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue); + YIO.enumCase(EN, "HiddenCompletionAction", + ValueKind::HiddenCompletionAction); + } +}; + +template <> +struct ScalarEnumerationTraits<ValueType> { + static void enumeration(IO &YIO, ValueType &EN) { + YIO.enumCase(EN, "Struct", ValueType::Struct); + YIO.enumCase(EN, "I8", ValueType::I8); + YIO.enumCase(EN, "U8", ValueType::U8); + YIO.enumCase(EN, "I16", ValueType::I16); + YIO.enumCase(EN, "U16", ValueType::U16); + YIO.enumCase(EN, "F16", ValueType::F16); + YIO.enumCase(EN, "I32", ValueType::I32); + YIO.enumCase(EN, "U32", ValueType::U32); + YIO.enumCase(EN, "F32", ValueType::F32); + YIO.enumCase(EN, "I64", ValueType::I64); + YIO.enumCase(EN, "U64", ValueType::U64); + YIO.enumCase(EN, "F64", ValueType::F64); + } +}; + +template <> +struct MappingTraits<Kernel::Attrs::Metadata> { + static void mapping(IO &YIO, Kernel::Attrs::Metadata &MD) { + 
YIO.mapOptional(Kernel::Attrs::Key::ReqdWorkGroupSize, + MD.mReqdWorkGroupSize, std::vector<uint32_t>()); + YIO.mapOptional(Kernel::Attrs::Key::WorkGroupSizeHint, + MD.mWorkGroupSizeHint, std::vector<uint32_t>()); + YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint, + MD.mVecTypeHint, std::string()); + } +}; + +template <> +struct MappingTraits<Kernel::Arg::Metadata> { + static void mapping(IO &YIO, Kernel::Arg::Metadata &MD) { + YIO.mapRequired(Kernel::Arg::Key::Size, MD.mSize); + YIO.mapRequired(Kernel::Arg::Key::Align, MD.mAlign); + YIO.mapRequired(Kernel::Arg::Key::ValueKind, MD.mValueKind); + YIO.mapRequired(Kernel::Arg::Key::ValueType, MD.mValueType); + YIO.mapOptional(Kernel::Arg::Key::PointeeAlign, MD.mPointeeAlign, + uint32_t(0)); + YIO.mapOptional(Kernel::Arg::Key::AccQual, MD.mAccQual, + AccessQualifier::Unknown); + YIO.mapOptional(Kernel::Arg::Key::AddrSpaceQual, MD.mAddrSpaceQual, + AddressSpaceQualifier::Unknown); + YIO.mapOptional(Kernel::Arg::Key::IsConst, MD.mIsConst, false); + YIO.mapOptional(Kernel::Arg::Key::IsPipe, MD.mIsPipe, false); + YIO.mapOptional(Kernel::Arg::Key::IsRestrict, MD.mIsRestrict, false); + YIO.mapOptional(Kernel::Arg::Key::IsVolatile, MD.mIsVolatile, false); + YIO.mapOptional(Kernel::Arg::Key::Name, MD.mName, std::string()); + YIO.mapOptional(Kernel::Arg::Key::TypeName, MD.mTypeName, std::string()); + } +}; + +template <> +struct MappingTraits<Kernel::CodeProps::Metadata> { + static void mapping(IO &YIO, Kernel::CodeProps::Metadata &MD) { + YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentSize, + MD.mKernargSegmentSize, uint64_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::WorkgroupGroupSegmentSize, + MD.mWorkgroupGroupSegmentSize, uint32_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::WorkitemPrivateSegmentSize, + MD.mWorkitemPrivateSegmentSize, uint32_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::WavefrontNumSGPRs, + MD.mWavefrontNumSGPRs, uint16_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::WorkitemNumVGPRs, + MD.mWorkitemNumVGPRs, uint16_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentAlign, + MD.mKernargSegmentAlign, uint8_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::GroupSegmentAlign, + MD.mGroupSegmentAlign, uint8_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::PrivateSegmentAlign, + MD.mPrivateSegmentAlign, uint8_t(0)); + YIO.mapOptional(Kernel::CodeProps::Key::WavefrontSize, + MD.mWavefrontSize, uint8_t(0)); + } +}; + +template <> +struct MappingTraits<Kernel::DebugProps::Metadata> { + static void mapping(IO &YIO, Kernel::DebugProps::Metadata &MD) { + YIO.mapOptional(Kernel::DebugProps::Key::DebuggerABIVersion, + MD.mDebuggerABIVersion, std::vector<uint32_t>()); + YIO.mapOptional(Kernel::DebugProps::Key::ReservedNumVGPRs, + MD.mReservedNumVGPRs, uint16_t(0)); + YIO.mapOptional(Kernel::DebugProps::Key::ReservedFirstVGPR, + MD.mReservedFirstVGPR, uint16_t(-1)); + YIO.mapOptional(Kernel::DebugProps::Key::PrivateSegmentBufferSGPR, + MD.mPrivateSegmentBufferSGPR, uint16_t(-1)); + YIO.mapOptional(Kernel::DebugProps::Key::WavefrontPrivateSegmentOffsetSGPR, + MD.mWavefrontPrivateSegmentOffsetSGPR, uint16_t(-1)); + } +}; + +template <> +struct MappingTraits<Kernel::Metadata> { + static void mapping(IO &YIO, Kernel::Metadata &MD) { + YIO.mapRequired(Kernel::Key::Name, MD.mName); + YIO.mapOptional(Kernel::Key::Language, MD.mLanguage, std::string()); + YIO.mapOptional(Kernel::Key::LanguageVersion, MD.mLanguageVersion, + std::vector<uint32_t>()); + if (!MD.mAttrs.empty() || !YIO.outputting()) + 
YIO.mapOptional(Kernel::Key::Attrs, MD.mAttrs); + if (!MD.mArgs.empty() || !YIO.outputting()) + YIO.mapOptional(Kernel::Key::Args, MD.mArgs); + if (!MD.mCodeProps.empty() || !YIO.outputting()) + YIO.mapOptional(Kernel::Key::CodeProps, MD.mCodeProps); + if (!MD.mDebugProps.empty() || !YIO.outputting()) + YIO.mapOptional(Kernel::Key::DebugProps, MD.mDebugProps); + } +}; + +template <> +struct MappingTraits<CodeObject::Metadata> { + static void mapping(IO &YIO, CodeObject::Metadata &MD) { + YIO.mapRequired(Key::Version, MD.mVersion); + YIO.mapOptional(Key::Printf, MD.mPrintf, std::vector<std::string>()); + if (!MD.mKernels.empty() || !YIO.outputting()) + YIO.mapOptional(Key::Kernels, MD.mKernels); + } +}; + +} // end namespace yaml + +namespace AMDGPU { + +/* static */ +std::error_code CodeObject::Metadata::fromYamlString( + std::string YamlString, CodeObject::Metadata &CodeObjectMetadata) { + yaml::Input YamlInput(YamlString); + YamlInput >> CodeObjectMetadata; + return YamlInput.error(); +} + +/* static */ +std::error_code CodeObject::Metadata::toYamlString( + CodeObject::Metadata CodeObjectMetadata, std::string &YamlString) { + raw_string_ostream YamlStream(YamlString); + yaml::Output YamlOutput(YamlStream, nullptr, std::numeric_limits<int>::max()); + YamlOutput << CodeObjectMetadata; + return std::error_code(); +} + +namespace CodeObject { + +void MetadataStreamer::dump(StringRef YamlString) const { + errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n'; +} + +void MetadataStreamer::verify(StringRef YamlString) const { + errs() << "AMDGPU Code Object Metadata Parser Test: "; + + CodeObject::Metadata FromYamlString; + if (Metadata::fromYamlString(YamlString, FromYamlString)) { + errs() << "FAIL\n"; + return; + } + + std::string ToYamlString; + if (Metadata::toYamlString(FromYamlString, ToYamlString)) { + errs() << "FAIL\n"; + return; + } + + errs() << (YamlString == ToYamlString ? 
"PASS" : "FAIL") << '\n'; + if (YamlString != ToYamlString) { + errs() << "Original input: " << YamlString << '\n' + << "Produced output: " << ToYamlString << '\n'; + } +} + +AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { + if (AccQual.empty()) + return AccessQualifier::Unknown; + + return StringSwitch<AccessQualifier>(AccQual) + .Case("read_only", AccessQualifier::ReadOnly) + .Case("write_only", AccessQualifier::WriteOnly) + .Case("read_write", AccessQualifier::ReadWrite) + .Default(AccessQualifier::Default); +} + +AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer( + unsigned AddressSpace) const { + if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS) + return AddressSpaceQualifier::Private; + if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS) + return AddressSpaceQualifier::Global; + if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS) + return AddressSpaceQualifier::Constant; + if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS) + return AddressSpaceQualifier::Local; + if (AddressSpace == AMDGPUASI.FLAT_ADDRESS) + return AddressSpaceQualifier::Generic; + if (AddressSpace == AMDGPUASI.REGION_ADDRESS) + return AddressSpaceQualifier::Region; + + llvm_unreachable("Unknown address space qualifier"); +} + +ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { + if (TypeQual.find("pipe") != StringRef::npos) + return ValueKind::Pipe; + + return StringSwitch<ValueKind>(BaseTypeName) + .Case("sampler_t", ValueKind::Sampler) + .Case("queue_t", ValueKind::Queue) + .Cases("image1d_t", + "image1d_array_t", + "image1d_buffer_t", + "image2d_t" , + "image2d_array_t", + "image2d_array_depth_t", + "image2d_array_msaa_t" + "image2d_array_msaa_depth_t" + "image2d_depth_t", + "image2d_msaa_t", + "image2d_msaa_depth_t", + "image3d_t", ValueKind::Image) + .Default(isa<PointerType>(Ty) ? + (Ty->getPointerAddressSpace() == + AMDGPUASI.LOCAL_ADDRESS ? + ValueKind::DynamicSharedPointer : + ValueKind::GlobalBuffer) : + ValueKind::ByValue); +} + +ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + auto Signed = !TypeName.startswith("u"); + switch (Ty->getIntegerBitWidth()) { + case 8: + return Signed ? ValueType::I8 : ValueType::U8; + case 16: + return Signed ? ValueType::I16 : ValueType::U16; + case 32: + return Signed ? ValueType::I32 : ValueType::U32; + case 64: + return Signed ? 
ValueType::I64 : ValueType::U64; + default: + return ValueType::Struct; + } + } + case Type::HalfTyID: + return ValueType::F16; + case Type::FloatTyID: + return ValueType::F32; + case Type::DoubleTyID: + return ValueType::F64; + case Type::PointerTyID: + return getValueType(Ty->getPointerElementType(), TypeName); + case Type::VectorTyID: + return getValueType(Ty->getVectorElementType(), TypeName); + default: + return ValueType::Struct; + } +} + +std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + if (!Signed) + return (Twine('u') + getTypeName(Ty, true)).str(); + + auto BitWidth = Ty->getIntegerBitWidth(); + switch (BitWidth) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BitWidth)).str(); + } + } + case Type::HalfTyID: + return "half"; + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + return "double"; + case Type::VectorTyID: { + auto VecTy = cast<VectorType>(Ty); + auto ElTy = VecTy->getElementType(); + auto NumElements = VecTy->getVectorNumElements(); + return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); + } + default: + return "unknown"; + } +} + +std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( + MDNode *Node) const { + std::vector<uint32_t> Dims; + if (Node->getNumOperands() != 3) + return Dims; + + for (auto &Op : Node->operands()) + Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue()); + return Dims; +} + +void MetadataStreamer::emitVersion() { + auto &Version = CodeObjectMetadata.mVersion; + + Version.push_back(MetadataVersionMajor); + Version.push_back(MetadataVersionMinor); +} + +void MetadataStreamer::emitPrintf(const Module &Mod) { + auto &Printf = CodeObjectMetadata.mPrintf; + + auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); + if (!Node) + return; + + for (auto Op : Node->operands()) + if (Op->getNumOperands()) + Printf.push_back(cast<MDString>(Op->getOperand(0))->getString()); +} + +void MetadataStreamer::emitKernelLanguage(const Function &Func) { + auto &Kernel = CodeObjectMetadata.mKernels.back(); + + // TODO: What about other languages? + auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); + if (!Node || !Node->getNumOperands()) + return; + auto Op0 = Node->getOperand(0); + if (Op0->getNumOperands() <= 1) + return; + + Kernel.mLanguage = "OpenCL C"; + Kernel.mLanguageVersion.push_back( + mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()); + Kernel.mLanguageVersion.push_back( + mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()); +} + +void MetadataStreamer::emitKernelAttrs(const Function &Func) { + auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs; + + if (auto Node = Func.getMetadata("reqd_work_group_size")) + Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("work_group_size_hint")) + Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("vec_type_hint")) { + Attrs.mVecTypeHint = getTypeName( + cast<ValueAsMetadata>(Node->getOperand(0))->getType(), + mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()); + } +} + +void MetadataStreamer::emitKernelArgs(const Function &Func) { + for (auto &Arg : Func.args()) + emitKernelArg(Arg); + + // TODO: What about other languages? 
+ if (!Func.getParent()->getNamedMetadata("opencl.ocl.version")) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + + if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + return; + + auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), + AMDGPUASI.GLOBAL_ADDRESS); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); +} + +void MetadataStreamer::emitKernelArg(const Argument &Arg) { + auto Func = Arg.getParent(); + auto ArgNo = Arg.getArgNo(); + const MDNode *Node; + + StringRef TypeQual; + Node = Func->getMetadata("kernel_arg_type_qual"); + if (Node && ArgNo < Node->getNumOperands()) + TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef BaseTypeName; + Node = Func->getMetadata("kernel_arg_base_type"); + if (Node && ArgNo < Node->getNumOperands()) + BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef AccQual; + if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && + Arg.hasNoAliasAttr()) { + AccQual = "read_only"; + } else { + Node = Func->getMetadata("kernel_arg_access_qual"); + if (Node && ArgNo < Node->getNumOperands()) + AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + } + + StringRef Name; + Node = Func->getMetadata("kernel_arg_name"); + if (Node && ArgNo < Node->getNumOperands()) + Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef TypeName; + Node = Func->getMetadata("kernel_arg_type"); + if (Node && ArgNo < Node->getNumOperands()) + TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), + getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual, + BaseTypeName, AccQual, Name, TypeName); +} + +void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, + ValueKind ValueKind, StringRef TypeQual, + StringRef BaseTypeName, StringRef AccQual, + StringRef Name, StringRef TypeName) { + CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); + auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back(); + + Arg.mSize = DL.getTypeAllocSize(Ty); + Arg.mAlign = DL.getABITypeAlignment(Ty); + Arg.mValueKind = ValueKind; + Arg.mValueType = getValueType(Ty, BaseTypeName); + + if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + auto ElTy = PtrTy->getElementType(); + if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized()) + Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy); + } + + Arg.mAccQual = getAccessQualifier(AccQual); + + if (auto PtrTy = dyn_cast<PointerType>(Ty)) + Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); + + SmallVector<StringRef, 1> SplitTypeQuals; + TypeQual.split(SplitTypeQuals, " ", -1, false); + for (StringRef Key : SplitTypeQuals) { + auto P = StringSwitch<bool*>(Key) + .Case("const", &Arg.mIsConst) + .Case("pipe", &Arg.mIsPipe) + .Case("restrict", &Arg.mIsRestrict) + .Case("volatile", &Arg.mIsVolatile) + .Default(nullptr); + if (P) + *P = true; + } + + Arg.mName = Name; + Arg.mTypeName = TypeName; +} + +void MetadataStreamer::emitKernelCodeProps( + const amd_kernel_code_t &KernelCode) { + auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps; + + CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size; + 
CodeProps.mWorkgroupGroupSegmentSize = + KernelCode.workgroup_group_segment_byte_size; + CodeProps.mWorkitemPrivateSegmentSize = + KernelCode.workitem_private_segment_byte_size; + CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count; + CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count; + CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment; + CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment; + CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment; + CodeProps.mWavefrontSize = KernelCode.wavefront_size; +} + +void MetadataStreamer::emitKernelDebugProps( + const amd_kernel_code_t &KernelCode) { + if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED)) + return; + + auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps; + + // FIXME: Need to pass down debugger ABI version through features. This is ok + // for now because we only have one version. + DebugProps.mDebuggerABIVersion.push_back(1); + DebugProps.mDebuggerABIVersion.push_back(0); + DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count; + DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first; + DebugProps.mPrivateSegmentBufferSGPR = + KernelCode.debug_private_segment_buffer_sgpr; + DebugProps.mWavefrontPrivateSegmentOffsetSGPR = + KernelCode.debug_wavefront_private_segment_offset_sgpr; +} + +void MetadataStreamer::begin(const Module &Mod) { + AMDGPUASI = getAMDGPUAS(Mod); + emitVersion(); + emitPrintf(Mod); +} + +void MetadataStreamer::emitKernel(const Function &Func, + const amd_kernel_code_t &KernelCode) { + if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) + return; + + CodeObjectMetadata.mKernels.push_back(Kernel::Metadata()); + auto &Kernel = CodeObjectMetadata.mKernels.back(); + + Kernel.mName = Func.getName(); + emitKernelLanguage(Func); + emitKernelAttrs(Func); + emitKernelArgs(Func); + emitKernelCodeProps(KernelCode); + emitKernelDebugProps(KernelCode); +} + +ErrorOr<std::string> MetadataStreamer::toYamlString() { + std::string YamlString; + if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString)) + return Error; + + if (DumpCodeObjectMetadata) + dump(YamlString); + if (VerifyCodeObjectMetadata) + verify(YamlString); + + return YamlString; +} + +ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) { + if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata)) + return Error; + + return toYamlString(); +} + +} // end namespace CodeObject +} // end namespace AMDGPU +} // end namespace llvm diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h new file mode 100644 index 000000000000..8d4c51763f63 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h @@ -0,0 +1,99 @@ +//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Code Object Metadata Streamer. 
+/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H + +#include "AMDGPU.h" +#include "AMDGPUCodeObjectMetadata.h" +#include "AMDKernelCodeT.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorOr.h" + +namespace llvm { + +class Argument; +class DataLayout; +class Function; +class MDNode; +class Module; +class Type; + +namespace AMDGPU { +namespace CodeObject { + +class MetadataStreamer final { +private: + Metadata CodeObjectMetadata; + AMDGPUAS AMDGPUASI; + + void dump(StringRef YamlString) const; + + void verify(StringRef YamlString) const; + + AccessQualifier getAccessQualifier(StringRef AccQual) const; + + AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const; + + ValueKind getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const; + + ValueType getValueType(Type *Ty, StringRef TypeName) const; + + std::string getTypeName(Type *Ty, bool Signed) const; + + std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; + + void emitVersion(); + + void emitPrintf(const Module &Mod); + + void emitKernelLanguage(const Function &Func); + + void emitKernelAttrs(const Function &Func); + + void emitKernelArgs(const Function &Func); + + void emitKernelArg(const Argument &Arg); + + void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, + StringRef TypeQual = "", StringRef BaseTypeName = "", + StringRef AccQual = "", StringRef Name = "", + StringRef TypeName = ""); + + void emitKernelCodeProps(const amd_kernel_code_t &KernelCode); + + void emitKernelDebugProps(const amd_kernel_code_t &KernelCode); + +public: + MetadataStreamer() = default; + ~MetadataStreamer() = default; + + void begin(const Module &Mod); + + void end() {} + + void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode); + + ErrorOr<std::string> toYamlString(); + + ErrorOr<std::string> toYamlString(StringRef YamlString); +}; + +} // end namespace CodeObject +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 1847d7a67328..073d19422e86 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -1,16 +1,20 @@ -//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// -/// \file //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -19,20 +23,21 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend); + protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} // End anonymous namespace +} // end anonymous namespace AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend) : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA, ELF::EM_AMDGPU, - HasRelocationAddend) { } + HasRelocationAddend) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -77,7 +82,6 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("unhandled relocation type"); } - MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend, raw_pwrite_stream &OS) { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 548bad56e174..f80b5f3a6dba 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -54,11 +54,17 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_OPERAND_ENUM +#undef GET_INSTRINFO_ENUM + #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp deleted file mode 100644 index 95387ad1627c..000000000000 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp +++ /dev/null @@ -1,408 +0,0 @@ -//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Generates AMDGPU runtime metadata for YAML mapping. 
-// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPURuntimeMetadata.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/YAMLTraits.h" -#include <vector> -#include "AMDGPURuntimeMD.h" - -using namespace llvm; -using namespace ::AMDGPU::RuntimeMD; - -static cl::opt<bool> -DumpRuntimeMD("amdgpu-dump-rtmd", - cl::desc("Dump AMDGPU runtime metadata")); - -static cl::opt<bool> -CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden, - cl::desc("Check AMDGPU runtime metadata YAML parser")); - -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string) -LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata) -LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata) - -namespace llvm { -namespace yaml { - -template <> struct MappingTraits<KernelArg::Metadata> { - static void mapping(IO &YamlIO, KernelArg::Metadata &A) { - YamlIO.mapRequired(KeyName::ArgSize, A.Size); - YamlIO.mapRequired(KeyName::ArgAlign, A.Align); - YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U); - YamlIO.mapRequired(KeyName::ArgKind, A.Kind); - YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType); - YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string()); - YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string()); - YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL); - YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL); - YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0)); - YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0)); - } - static const bool flow = true; -}; - -template <> struct MappingTraits<Kernel::Metadata> { - static void mapping(IO &YamlIO, Kernel::Metadata &K) { - YamlIO.mapRequired(KeyName::KernelName, K.Name); - YamlIO.mapOptional(KeyName::Language, K.Language, std::string()); - YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion); - YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize); - YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint); - YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string()); - YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex, - INVALID_KERNEL_INDEX); - YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups, - uint8_t(0)); - YamlIO.mapRequired(KeyName::Args, K.Args); - } - static const bool flow = true; -}; - -template <> struct MappingTraits<Program::Metadata> { - static void mapping(IO &YamlIO, Program::Metadata &Prog) { - YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq); - YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo); - YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels); - } - static const bool flow = true; -}; - -} // end namespace yaml -} // end namespace llvm - -// Get a vector of three integer values from MDNode \p Node; -static std::vector<uint32_t> getThreeInt32(MDNode *Node) { - assert(Node->getNumOperands() == 3); - std::vector<uint32_t> V; - for (const MDOperand &Op : Node->operands()) { - const ConstantInt *CI = mdconst::extract<ConstantInt>(Op); - V.push_back(CI->getZExtValue()); - } - 
return V; -} - -static std::string getOCLTypeName(Type *Ty, bool Signed) { - switch (Ty->getTypeID()) { - case Type::HalfTyID: - return "half"; - case Type::FloatTyID: - return "float"; - case Type::DoubleTyID: - return "double"; - case Type::IntegerTyID: { - if (!Signed) - return (Twine('u') + getOCLTypeName(Ty, true)).str(); - unsigned BW = Ty->getIntegerBitWidth(); - switch (BW) { - case 8: - return "char"; - case 16: - return "short"; - case 32: - return "int"; - case 64: - return "long"; - default: - return (Twine('i') + Twine(BW)).str(); - } - } - case Type::VectorTyID: { - VectorType *VecTy = cast<VectorType>(Ty); - Type *EleTy = VecTy->getElementType(); - unsigned Size = VecTy->getVectorNumElements(); - return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str(); - } - default: - return "unknown"; - } -} - -static KernelArg::ValueType getRuntimeMDValueType( - Type *Ty, StringRef TypeName) { - switch (Ty->getTypeID()) { - case Type::HalfTyID: - return KernelArg::F16; - case Type::FloatTyID: - return KernelArg::F32; - case Type::DoubleTyID: - return KernelArg::F64; - case Type::IntegerTyID: { - bool Signed = !TypeName.startswith("u"); - switch (Ty->getIntegerBitWidth()) { - case 8: - return Signed ? KernelArg::I8 : KernelArg::U8; - case 16: - return Signed ? KernelArg::I16 : KernelArg::U16; - case 32: - return Signed ? KernelArg::I32 : KernelArg::U32; - case 64: - return Signed ? KernelArg::I64 : KernelArg::U64; - default: - // Runtime does not recognize other integer types. Report as struct type. - return KernelArg::Struct; - } - } - case Type::VectorTyID: - return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName); - case Type::PointerTyID: - return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName); - default: - return KernelArg::Struct; - } -} - -static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace( - AMDGPUAS::AddressSpaces A) { - switch (A) { - case AMDGPUAS::GLOBAL_ADDRESS: - return KernelArg::Global; - case AMDGPUAS::CONSTANT_ADDRESS: - return KernelArg::Constant; - case AMDGPUAS::LOCAL_ADDRESS: - return KernelArg::Local; - case AMDGPUAS::FLAT_ADDRESS: - return KernelArg::Generic; - case AMDGPUAS::REGION_ADDRESS: - return KernelArg::Region; - default: - return KernelArg::Private; - } -} - -static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL, - Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "", - StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "", - StringRef AccQual = "") { - - KernelArg::Metadata Arg; - - // Set ArgSize and ArgAlign. - Arg.Size = DL.getTypeAllocSize(T); - Arg.Align = DL.getABITypeAlignment(T); - if (auto PT = dyn_cast<PointerType>(T)) { - auto ET = PT->getElementType(); - if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized()) - Arg.PointeeAlign = DL.getABITypeAlignment(ET); - } - - // Set ArgTypeName. - Arg.TypeName = TypeName; - - // Set ArgName. - Arg.Name = ArgName; - - // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe. - SmallVector<StringRef, 1> SplitQ; - TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */); - - for (StringRef KeyName : SplitQ) { - auto *P = StringSwitch<uint8_t *>(KeyName) - .Case("volatile", &Arg.IsVolatile) - .Case("restrict", &Arg.IsRestrict) - .Case("const", &Arg.IsConst) - .Case("pipe", &Arg.IsPipe) - .Default(nullptr); - if (P) - *P = 1; - } - - // Set ArgKind. - Arg.Kind = Kind; - - // Set ArgValueType. - Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName); - - // Set ArgAccQual. 
- if (!AccQual.empty()) { - Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual) - .Case("read_only", KernelArg::ReadOnly) - .Case("write_only", KernelArg::WriteOnly) - .Case("read_write", KernelArg::ReadWrite) - .Default(KernelArg::AccNone); - } - - // Set ArgAddrQual. - if (auto *PT = dyn_cast<PointerType>(T)) { - Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>( - PT->getAddressSpace())); - } - - return Arg; -} - -static Kernel::Metadata getRuntimeMDForKernel(const Function &F) { - Kernel::Metadata Kernel; - Kernel.Name = F.getName(); - auto &M = *F.getParent(); - - // Set Language and LanguageVersion. - if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { - if (MD->getNumOperands() != 0) { - auto Node = MD->getOperand(0); - if (Node->getNumOperands() > 1) { - Kernel.Language = "OpenCL C"; - uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) - ->getZExtValue(); - uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) - ->getZExtValue(); - Kernel.LanguageVersion.push_back(Major); - Kernel.LanguageVersion.push_back(Minor); - } - } - } - - const DataLayout &DL = F.getParent()->getDataLayout(); - for (auto &Arg : F.args()) { - unsigned I = Arg.getArgNo(); - Type *T = Arg.getType(); - auto TypeName = dyn_cast<MDString>(F.getMetadata( - "kernel_arg_type")->getOperand(I))->getString(); - auto BaseTypeName = cast<MDString>(F.getMetadata( - "kernel_arg_base_type")->getOperand(I))->getString(); - StringRef ArgName; - if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) - ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString(); - auto TypeQual = cast<MDString>(F.getMetadata( - "kernel_arg_type_qual")->getOperand(I))->getString(); - auto AccQual = cast<MDString>(F.getMetadata( - "kernel_arg_access_qual")->getOperand(I))->getString(); - KernelArg::Kind Kind; - if (TypeQual.find("pipe") != StringRef::npos) - Kind = KernelArg::Pipe; - else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName) - .Case("sampler_t", KernelArg::Sampler) - .Case("queue_t", KernelArg::Queue) - .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", - "image2d_t" , "image2d_array_t", KernelArg::Image) - .Cases("image2d_depth_t", "image2d_array_depth_t", - "image2d_msaa_t", "image2d_array_msaa_t", - "image2d_msaa_depth_t", KernelArg::Image) - .Cases("image2d_array_msaa_depth_t", "image3d_t", - KernelArg::Image) - .Default(isa<PointerType>(T) ? - (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ? - KernelArg::DynamicSharedPointer : - KernelArg::GlobalBuffer) : - KernelArg::ByValue); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind, - BaseTypeName, TypeName, ArgName, TypeQual, AccQual)); - } - - // Emit hidden kernel arguments for OpenCL kernels. - if (F.getParent()->getNamedMetadata("opencl.ocl.version")) { - auto Int64T = Type::getInt64Ty(F.getContext()); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetX)); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetY)); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T, - KernelArg::HiddenGlobalOffsetZ)); - if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) { - auto Int8PtrT = Type::getInt8PtrTy(F.getContext(), - KernelArg::Global); - Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT, - KernelArg::HiddenPrintfBuffer)); - } - } - - // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint. 
- if (auto RWGS = F.getMetadata("reqd_work_group_size")) - Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS); - - if (auto WGSH = F.getMetadata("work_group_size_hint")) - Kernel.WorkGroupSizeHint = getThreeInt32(WGSH); - - if (auto VTH = F.getMetadata("vec_type_hint")) - Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>( - VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>( - VTH->getOperand(1))->getZExtValue()); - - return Kernel; -} - -Program::Metadata::Metadata(const std::string &YAML) { - yaml::Input Input(YAML); - Input >> *this; -} - -std::string Program::Metadata::toYAML(void) { - std::string Text; - raw_string_ostream Stream(Text); - yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */); - Output << *this; - return Stream.str(); -} - -Program::Metadata Program::Metadata::fromYAML(const std::string &S) { - return Program::Metadata(S); -} - -// Check if the YAML string can be parsed. -static void checkRuntimeMDYAMLString(const std::string &YAML) { - auto P = Program::Metadata::fromYAML(YAML); - auto S = P.toYAML(); - llvm::errs() << "AMDGPU runtime metadata parser test " - << (YAML == S ? "passes" : "fails") << ".\n"; - if (YAML != S) { - llvm::errs() << "First output: " << YAML << '\n' - << "Second output: " << S << '\n'; - } -} - -std::string llvm::getRuntimeMDYAMLString(Module &M) { - Program::Metadata Prog; - Prog.MDVersionSeq.push_back(MDVersion); - Prog.MDVersionSeq.push_back(MDRevision); - - // Set PrintfInfo. - if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) { - for (unsigned I = 0; I < MD->getNumOperands(); ++I) { - auto Node = MD->getOperand(I); - if (Node->getNumOperands() > 0) - Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0)) - ->getString()); - } - } - - // Set Kernels. - for (auto &F: M.functions()) { - if (!F.getMetadata("kernel_arg_type")) - continue; - Prog.Kernels.emplace_back(getRuntimeMDForKernel(F)); - } - - auto YAML = Prog.toYAML(); - - if (DumpRuntimeMD) - llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n'; - - if (CheckRuntimeMDParser) - checkRuntimeMDYAMLString(YAML); - - return YAML; -} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h deleted file mode 100644 index a92fdd4bebc2..000000000000 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h +++ /dev/null @@ -1,26 +0,0 @@ -//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares functions for generating runtime metadata. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H -#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H - -#include <string> - -namespace llvm { -class Module; - -// Get runtime metadata as YAML string. 
-std::string getRuntimeMDYAMLString(Module &M); - -} -#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 3392183d33c3..8dc863f723e2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -27,7 +27,6 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/ELF.h" #include "llvm/Support/FormattedStream.h" -#include "AMDGPURuntimeMD.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -36,9 +35,27 @@ namespace llvm { using namespace llvm; using namespace llvm::AMDGPU; +//===----------------------------------------------------------------------===// +// AMDGPUTargetStreamer +//===----------------------------------------------------------------------===// + AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} +void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) { + CodeObjectMetadataStreamer.begin(Mod); +} + +void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata( + const Function &Func, const amd_kernel_code_t &KernelCode) { + CodeObjectMetadataStreamer.emitKernel(Func, KernelCode); +} + +void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() { + CodeObjectMetadataStreamer.end(); + EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get()); +} + //===----------------------------------------------------------------------===// // AMDGPUTargetAsmStreamer //===----------------------------------------------------------------------===// @@ -93,16 +110,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; } -void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) { - OS << "\t.amdgpu_runtime_metadata\n"; - OS << getRuntimeMDYAMLString(M); - OS << "\n\t.end_amdgpu_runtime_metadata\n"; -} +bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) { + auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString); + if (!VerifiedYamlString) + return false; -void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) { - OS << "\t.amdgpu_runtime_metadata"; - OS << Metadata; - OS << "\t.end_amdgpu_runtime_metadata\n"; + OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n'; + OS << VerifiedYamlString.get(); + OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n'; + + return true; } //===----------------------------------------------------------------------===// @@ -116,22 +133,21 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } -void -AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ, - PT_NOTE::NoteType Type, - std::function<void(MCELFStreamer &)> EmitDesc) { +void AMDGPUTargetELFStreamer::EmitAMDGPUNote( + const MCExpr *DescSZ, ElfNote::NoteType Type, + function_ref<void(MCELFStreamer &)> EmitDesc) { auto &S = getStreamer(); auto &Context = S.getContext(); - auto NameSZ = sizeof(PT_NOTE::NoteName); + auto NameSZ = sizeof(ElfNote::NoteName); S.PushSection(); S.SwitchSection(Context.getELFSection( - PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); + ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); S.EmitIntValue(NameSZ, 4); // namesz S.EmitValue(DescSZ, 4); // descz - S.EmitIntValue(Type, 4); // type - S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name + S.EmitIntValue(Type, 4); // type + 
S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 @@ -144,7 +160,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, EmitAMDGPUNote( MCConstantExpr::create(8, getContext()), - PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS){ OS.EmitIntValue(Major, 4); OS.EmitIntValue(Minor, 4); @@ -160,14 +176,14 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, StringRef ArchName) { uint16_t VendorNameSize = VendorName.size() + 1; uint16_t ArchNameSize = ArchName.size() + 1; - + unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + VendorNameSize + ArchNameSize; EmitAMDGPUNote( MCConstantExpr::create(DescSZ, getContext()), - PT_NOTE::NT_AMDGPU_HSA_ISA, + ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { OS.EmitIntValue(VendorNameSize, 2); OS.EmitIntValue(ArchNameSize, 2); @@ -216,7 +232,11 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( Symbol->setBinding(ELF::STB_GLOBAL); } -void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) { +bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) { + auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString); + if (!VerifiedYamlString) + return false; + // Create two labels to mark the beginning and end of the desc field // and a MCExpr to calculate the size of the desc field. auto &Context = getContext(); @@ -228,15 +248,13 @@ void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) { EmitAMDGPUNote( DescSZ, - PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA, + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA, [&](MCELFStreamer &OS) { OS.EmitLabel(DescBegin); - OS.EmitBytes(Metadata); + OS.EmitBytes(VerifiedYamlString.get()); OS.EmitLabel(DescEnd); } ); -} -void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) { - EmitRuntimeMetadata(getRuntimeMDYAMLString(M)); + return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index e2f20586903d..5c588bbded9c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -10,6 +10,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#include "AMDGPUCodeObjectMetadataStreamer.h" #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" @@ -26,6 +27,7 @@ class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { protected: + AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer; MCContext &getContext() const { return Streamer.getContext(); } public: @@ -46,12 +48,18 @@ public: virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; - virtual void EmitRuntimeMetadata(Module &M) = 0; + virtual void EmitStartOfCodeObjectMetadata(const Module &Mod); - virtual void EmitRuntimeMetadata(StringRef Metadata) = 0; + virtual void EmitKernelCodeObjectMetadata( + const Function &Func, const amd_kernel_code_t &KernelCode); + + virtual void EmitEndOfCodeObjectMetadata(); + + /// \returns True on success, false on failure. 
+ virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0; }; -class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { +class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { formatted_raw_ostream &OS; public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); @@ -70,17 +78,16 @@ public: void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - void EmitRuntimeMetadata(Module &M) override; - - void EmitRuntimeMetadata(StringRef Metadata) override; + /// \returns True on success, false on failure. + bool EmitCodeObjectMetadata(StringRef YamlString) override; }; -class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { +class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; - void EmitAMDGPUNote(const MCExpr* DescSize, - AMDGPU::PT_NOTE::NoteType Type, - std::function<void(MCELFStreamer &)> EmitDesc); + void EmitAMDGPUNote(const MCExpr *DescSize, + AMDGPU::ElfNote::NoteType Type, + function_ref<void(MCELFStreamer &)> EmitDesc); public: AMDGPUTargetELFStreamer(MCStreamer &S); @@ -102,9 +109,8 @@ public: void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; - void EmitRuntimeMetadata(Module &M) override; - - void EmitRuntimeMetadata(StringRef Metadata) override; + /// \returns True on success, false on failure. + bool EmitCodeObjectMetadata(StringRef YamlString) override; }; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt index 8a6d00ce69ed..09e3efad10af 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -1,13 +1,12 @@ - add_llvm_library(LLVMAMDGPUDesc AMDGPUAsmBackend.cpp + AMDGPUCodeObjectMetadataStreamer.cpp AMDGPUELFObjectWriter.cpp AMDGPUELFStreamer.cpp + AMDGPUMCAsmInfo.cpp AMDGPUMCCodeEmitter.cpp AMDGPUMCTargetDesc.cpp - AMDGPUMCAsmInfo.cpp - AMDGPURuntimeMD.cpp AMDGPUTargetStreamer.cpp R600MCCodeEmitter.cpp SIMCCodeEmitter.cpp - ) +) diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 0c5bb0648a16..bda0928036fd 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -220,13 +220,35 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, Imm = MO.getImm(); } - switch (AMDGPU::getOperandSize(OpInfo)) { - case 4: + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - case 8: + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: return getLit64Encoding(static_cast<uint64_t>(Imm), STI); - case 2: + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + // FIXME Is this correct? What do inline immediates do on SI for f16 src + // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + uint16_t Lo16 = static_cast<uint16_t>(Imm); + assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); + uint32_t Encoding = getLit16Encoding(Lo16, STI); + assert(Encoding != 255 && "packed constants can only be inline immediates"); + return Encoding; + } default: llvm_unreachable("invalid operand size"); } diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 46803e555711..a515eecc222a 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -475,106 +475,6 @@ class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < sub0) >; -// ======= SI Image Intrinsics ================ - -// Image load -defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; -defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; -def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; - -// Basic sample -defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; -defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; -defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">; -defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; - -// Sample with comparison -defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">; -defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; - -// Sample with offsets -defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">; -defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">; -defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">; -defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; -defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; - -// Sample with comparison and offsets -defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; 
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; -defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>; - -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>; -def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>; - -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>; -def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, 
v4i32>; - // ======= amdgcn Image Intrinsics ============== // Image load diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index 3c07cc76b9a1..0e4eda982139 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -187,3 +187,10 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel, [FeatureISAVersion8_1_0] >; +def : ProcessorModel<"gfx900", SIQuarterSpeedModel, + [FeatureGFX9, FeatureISAVersion9_0_0, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"gfx901", SIQuarterSpeedModel, + [FeatureGFX9, FeatureXNACK, FeatureISAVersion9_0_1, FeatureLDSBankCount32] +>; diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 45b36d3d3ebb..811b905588b4 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -19,10 +19,26 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <new> +#include <set> +#include <utility> +#include <vector> using namespace llvm; @@ -43,13 +59,12 @@ struct CFStack { std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; unsigned MaxStackSize; - unsigned CurrentEntries; - unsigned CurrentSubEntries; + unsigned CurrentEntries = 0; + unsigned CurrentSubEntries = 0; CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0), - CurrentEntries(0), CurrentSubEntries(0) { } + MaxStackSize(cc == CallingConv::AMDGPU_VS ? 
1 : 0) {} unsigned getLoopDepth(); bool branchStackContains(CFStack::StackItem); @@ -198,9 +213,8 @@ void CFStack::popLoop() { } class R600ControlFlowFinalizer : public MachineFunctionPass { - private: - typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile; enum ControlFlowInstruction { CF_TC, @@ -217,10 +231,10 @@ private: }; static char ID; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; + const R600InstrInfo *TII = nullptr; + const R600RegisterInfo *TRI = nullptr; unsigned MaxFetchInst; - const R600Subtarget *ST; + const R600Subtarget *ST = nullptr; bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { @@ -355,7 +369,7 @@ private: continue; int64_t Imm = Src.second; std::vector<MachineOperand *>::iterator It = - find_if(Lits, [&](MachineOperand *val) { + llvm::find_if(Lits, [&](MachineOperand *val) { return val->isImm() && (val->getImm() == Imm); }); @@ -485,8 +499,7 @@ private: } public: - R600ControlFlowFinalizer(TargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { ST = &MF.getSubtarget<R600Subtarget>(); @@ -501,7 +514,7 @@ public: ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; - std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; + std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack; std::vector<MachineInstr * > IfThenElseStack; if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), @@ -554,7 +567,7 @@ public: MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) .addImm(1); - std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, + std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); LoopStack.push_back(std::move(Pair)); @@ -564,7 +577,7 @@ public: } case AMDGPU::ENDLOOP: { CFStack.popLoop(); - std::pair<unsigned, std::set<MachineInstr *> > Pair = + std::pair<unsigned, std::set<MachineInstr *>> Pair = std::move(LoopStack.back()); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); @@ -693,7 +706,6 @@ char R600ControlFlowFinalizer::ID = 0; } // end anonymous namespace - -llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { +FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { return new R600ControlFlowFinalizer(TM); } diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 9a5db6ccc672..03fc1aff5ec1 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -17,26 +17,37 @@ #include "AMDGPU.h" #include "R600Defines.h" #include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Pass.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> 
+#include <utility> +#include <vector> using namespace llvm; namespace llvm { + void initializeR600EmitClauseMarkersPass(PassRegistry&); -} + +} // end namespace llvm namespace { class R600EmitClauseMarkers : public MachineFunctionPass { - private: - const R600InstrInfo *TII; - int Address; + const R600InstrInfo *TII = nullptr; + int Address = 0; unsigned OccupiedDwords(MachineInstr &MI) const { switch (MI.getOpcode()) { @@ -118,7 +129,7 @@ private: SubstituteKCacheBank(MachineInstr &MI, std::vector<std::pair<unsigned, unsigned>> &CachedConsts, bool UpdateInstr = true) const { - std::vector<std::pair<unsigned, unsigned> > UsedKCache; + std::vector<std::pair<unsigned, unsigned>> UsedKCache; if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) return true; @@ -181,10 +192,11 @@ private: bool canClauseLocalKillFitInClause( unsigned AluInstCount, - std::vector<std::pair<unsigned, unsigned> > KCacheBanks, + std::vector<std::pair<unsigned, unsigned>> KCacheBanks, MachineBasicBlock::iterator Def, MachineBasicBlock::iterator BBEnd) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); + //TODO: change this to defs? for (MachineInstr::const_mop_iterator MOI = Def->operands_begin(), MOE = Def->operands_end(); MOI != MOE; ++MOI) { @@ -207,15 +219,17 @@ private: if (AluInstCount >= TII->getMaxAlusPerClause()) return false; + // TODO: Is this true? kill flag appears to work OK below // Register kill flags have been cleared by the time we get to this // pass, but it is safe to assume that all uses of this register // occur in the same basic block as its definition, because // it is illegal for the scheduler to schedule them in // different blocks. - if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + if (UseI->readsRegister(MOI->getReg())) LastUseCount = AluInstCount; - if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + // Exit early if the current use kills the register + if (UseI != Def && UseI->killsRegister(MOI->getReg())) break; } if (LastUseCount) @@ -228,7 +242,7 @@ private: MachineBasicBlock::iterator MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator ClauseHead = I; - std::vector<std::pair<unsigned, unsigned> > KCacheBanks; + std::vector<std::pair<unsigned, unsigned>> KCacheBanks; bool PushBeforeModifier = false; unsigned AluInstCount = 0; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { @@ -294,8 +308,8 @@ private: public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { + R600EmitClauseMarkers() : MachineFunctionPass(ID) { initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); } @@ -310,9 +324,11 @@ public: if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(*I)) - I = MakeALUClause(MBB, I); - else + if (isALU(*I)) { + auto next = MakeALUClause(MBB, I); + assert(next != I); + I = next; + } else ++I; } } @@ -333,7 +349,6 @@ INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", "R600 Emit Clause Markters", false, false) -llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { +FunctionPass *llvm::createR600EmitClauseMarkers() { return new R600EmitClauseMarkers(); } - diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp index 5813786abe01..1f01ad732e00 100644 --- 
a/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -8,7 +8,43 @@ //==-----------------------------------------------------------------------===// #include "R600FrameLowering.h" +#include "AMDGPUSubtarget.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; R600FrameLowering::~R600FrameLowering() = default; + +/// \returns The number of registers allocated for \p FI. +int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const R600RegisterInfo *RI + = MF.getSubtarget<R600Subtarget>().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + + // Start the offset at 2 so we don't overwrite work group information. + // FIXME: We should only do this when the shader actually uses this + // information. + unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI.getNumObjects() : FI; + + for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); + OffsetBytes += MFI.getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. + OffsetBytes = alignTo(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index 874435f35ce4..142f70967eda 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -25,6 +25,8 @@ public: MachineBasicBlock &MBB) const override {} void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 77fee4356b65..3590a9b05e1d 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -221,6 +221,15 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, VT, Expand); } + // LLVM will expand these to atomic_cmp_swap(0) + // and atomic_swap, respectively. 
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setSchedulingPreference(Sched::Source); setTargetDAGCombine(ISD::FP_ROUND); @@ -266,7 +275,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI.getOperand(i)); + NewMI.add(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -339,34 +348,34 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; case AMDGPU::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) .addImm(isEOP(I)); // Set End of program bit break; case AMDGPU::BRANCH: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); break; case AMDGPU::BRANCH_COND_f32: { MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(AMDGPU::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -375,12 +384,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineInstr *NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), AMDGPU::PREDICATE_BIT) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(AMDGPU::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -408,13 +417,13 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addOperand(MI.getOperand(3)) - .addOperand(MI.getOperand(4)) - .addOperand(MI.getOperand(5)) - .addOperand(MI.getOperand(6)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)) + .add(MI.getOperand(6)) .addImm(CfInst) .addImm(EOP); break; @@ -490,8 +499,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); - switch(IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + switch (IntrinsicID) { case AMDGPUIntrinsic::r600_tex: case AMDGPUIntrinsic::r600_texc: { unsigned TextureOp; @@ -552,7 +560,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const } case Intrinsic::r600_implicitarg_ptr: { - MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } @@ -599,6 +607,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_recipsqrt_clamped: return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + default: + return Op; } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) @@ -702,12 +712,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); const DataLayout &DL = DAG.getDataLayout(); const GlobalValue *GV = GSD->getGlobal(); - MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); @@ -864,7 +874,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUASI.CONSTANT_BUFFER_0); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt<16>(ByteOffset)); @@ -911,7 +921,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const if (VT == MVT::f32) { DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); if (MinMax) return MinMax; } @@ -1102,7 +1112,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, //TODO: Who creates the i8 stores? 
assert(Store->isTruncatingStore() || Store->getValue().getValueType() == MVT::i8); - assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); + assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS); SDValue Mask; if (Store->getMemoryVT() == MVT::i8) { @@ -1200,9 +1210,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { - if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { + if ((AS == AMDGPUASI.PRIVATE_ADDRESS) && + StoreNode->isTruncatingStore()) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? @@ -1225,7 +1236,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, DAG.getConstant(2, DL, PtrVT)); - if (AS == AMDGPUAS::GLOBAL_ADDRESS) { + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { @@ -1278,7 +1289,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes - if (AS != AMDGPUAS::PRIVATE_ADDRESS) + if (AS != AMDGPUASI.PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) @@ -1297,39 +1308,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // return (512 + (kc_bank << 12) static int -ConstantAddressBlock(unsigned AddressSpace) { +ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) { switch (AddressSpace) { - case AMDGPUAS::CONSTANT_BUFFER_0: + case AMDGPUASI.CONSTANT_BUFFER_0: return 512; - case AMDGPUAS::CONSTANT_BUFFER_1: + case AMDGPUASI.CONSTANT_BUFFER_1: return 512 + 4096; - case AMDGPUAS::CONSTANT_BUFFER_2: + case AMDGPUASI.CONSTANT_BUFFER_2: return 512 + 4096 * 2; - case AMDGPUAS::CONSTANT_BUFFER_3: + case AMDGPUASI.CONSTANT_BUFFER_3: return 512 + 4096 * 3; - case AMDGPUAS::CONSTANT_BUFFER_4: + case AMDGPUASI.CONSTANT_BUFFER_4: return 512 + 4096 * 4; - case AMDGPUAS::CONSTANT_BUFFER_5: + case AMDGPUASI.CONSTANT_BUFFER_5: return 512 + 4096 * 5; - case AMDGPUAS::CONSTANT_BUFFER_6: + case AMDGPUASI.CONSTANT_BUFFER_6: return 512 + 4096 * 6; - case AMDGPUAS::CONSTANT_BUFFER_7: + case AMDGPUASI.CONSTANT_BUFFER_7: return 512 + 4096 * 7; - case AMDGPUAS::CONSTANT_BUFFER_8: + case AMDGPUASI.CONSTANT_BUFFER_8: return 512 + 4096 * 8; - case AMDGPUAS::CONSTANT_BUFFER_9: + case AMDGPUASI.CONSTANT_BUFFER_9: return 512 + 4096 * 9; - case AMDGPUAS::CONSTANT_BUFFER_10: + case AMDGPUASI.CONSTANT_BUFFER_10: return 512 + 4096 * 10; - case AMDGPUAS::CONSTANT_BUFFER_11: + case AMDGPUASI.CONSTANT_BUFFER_11: return 512 + 4096 * 11; - case AMDGPUAS::CONSTANT_BUFFER_12: + case AMDGPUASI.CONSTANT_BUFFER_12: return 512 + 4096 * 12; - case AMDGPUAS::CONSTANT_BUFFER_13: + case AMDGPUASI.CONSTANT_BUFFER_13: return 512 + 4096 * 13; - case AMDGPUAS::CONSTANT_BUFFER_14: + case AMDGPUASI.CONSTANT_BUFFER_14: return 512 + 4096 * 14; - case AMDGPUAS::CONSTANT_BUFFER_15: + case AMDGPUASI.CONSTANT_BUFFER_15: return 512 + 4096 * 15; default: return -1; @@ -1397,7 +1408,7 @@ SDValue 
R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = LoadNode->getMemoryVT(); ISD::LoadExtType ExtType = LoadNode->getExtensionType(); - if (AS == AMDGPUAS::PRIVATE_ADDRESS && + if (AS == AMDGPUASI.PRIVATE_ADDRESS && ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { return lowerPrivateExtLoad(Op, DAG); } @@ -1407,13 +1418,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) && VT.isVector()) { return scalarizeVectorLoad(LoadNode, DAG); } - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(), + AMDGPUASI); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { @@ -1445,7 +1457,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, DL, MVT::i32)), DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) + AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32) ); } @@ -1481,7 +1493,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) { return SDValue(); } @@ -1535,7 +1547,7 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector<ISD::InputArg, 8> LocalIns; if (AMDGPU::isShader(CallConv)) { - AnalyzeFormalArguments(CCInfo, Ins); + CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); } else { analyzeFormalArgumentsCompute(CCInfo, Ins); } @@ -1558,7 +1570,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUASI.CONSTANT_BUFFER_0); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. 
It attempts to create this sextload, but it ends up diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index e88bd076718e..2422d57269eb 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -12,16 +12,34 @@ // //===----------------------------------------------------------------------===// -#include "R600InstrInfo.h" #include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDGPUTargetMachine.h" #include "R600Defines.h" -#include "R600MachineFunctionInfo.h" +#include "R600FrameLowering.h" +#include "R600InstrInfo.h" #include "R600RegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstring> +#include <cstdint> +#include <iterator> +#include <utility> +#include <vector> using namespace llvm; @@ -191,7 +209,7 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && usesVertexCache(MI.getOpcode())) || - usesTextureCache(MI.getOpcode()); + usesTextureCache(MI.getOpcode()); } bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { @@ -321,7 +339,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, unsigned &ConstCount) const { ConstCount = 0; const std::pair<int, unsigned> DummyPair(-1, 0); - std::vector<std::pair<int, unsigned> > Result; + std::vector<std::pair<int, unsigned>> Result; unsigned i = 0; for (const auto &Src : getSrcs(MI)) { ++i; @@ -348,8 +366,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, return Result; } -static std::vector<std::pair<int, unsigned> > -Swizzle(std::vector<std::pair<int, unsigned> > Src, +static std::vector<std::pair<int, unsigned>> +Swizzle(std::vector<std::pair<int, unsigned>> Src, R600InstrInfo::BankSwizzle Swz) { if (Src[0] == Src[1]) Src[1].first = -1; @@ -404,14 +422,14 @@ static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { /// in the same Instruction Group while meeting read port limitations given a /// Swz swizzle sequence. unsigned R600InstrInfo::isLegalUpTo( - const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs, const std::vector<R600InstrInfo::BankSwizzle> &Swz, - const std::vector<std::pair<int, unsigned> > &TransSrcs, + const std::vector<std::pair<int, unsigned>> &TransSrcs, R600InstrInfo::BankSwizzle TransSwz) const { int Vector[4][3]; memset(Vector, -1, sizeof(Vector)); for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { - const std::vector<std::pair<int, unsigned> > &Srcs = + const std::vector<std::pair<int, unsigned>> &Srcs = Swizzle(IGSrcs[i], Swz[i]); for (unsigned j = 0; j < 3; j++) { const std::pair<int, unsigned> &Src = Srcs[j]; @@ -473,9 +491,9 @@ NextPossibleSolution( /// Enumerate all possible Swizzle sequence to find one that can meet all /// read port requirements. 
bool R600InstrInfo::FindSwizzleForVectorSlot( - const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, + const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs, std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate, - const std::vector<std::pair<int, unsigned> > &TransSrcs, + const std::vector<std::pair<int, unsigned>> &TransSrcs, R600InstrInfo::BankSwizzle TransSwz) const { unsigned ValidUpTo = 0; do { @@ -490,7 +508,7 @@ bool R600InstrInfo::FindSwizzleForVectorSlot( /// a const, and can't read a gpr at cycle 1 if they read 2 const. static bool isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, - const std::vector<std::pair<int, unsigned> > &TransOps, + const std::vector<std::pair<int, unsigned>> &TransOps, unsigned ConstCount) { // TransALU can't read 3 constants if (ConstCount > 2) @@ -516,7 +534,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, const { //Todo : support shared src0 - src1 operand - std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs; + std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs; ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; @@ -527,7 +545,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) IG[i]->getOperand(Op).getImm()); } - std::vector<std::pair<int, unsigned> > TransOps; + std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); @@ -556,7 +574,6 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, return false; } - bool R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { @@ -780,7 +797,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { - assert(!BytesRemoved && "code size not handled"); + assert(!BytesRemoved && "code size not handled"); // Note : we leave PRED* instructions there. // They may be needed when predicating instructions. @@ -852,7 +869,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { } } -bool R600InstrInfo::isPredicable(MachineInstr &MI) const { +bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { // XXX: KILL* instructions can be predicated, but they must be the last // instruction in a clause, so this means any instructions after them cannot // be predicated. Until we have proper support for instruction clauses in the @@ -863,7 +880,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const { } else if (MI.getOpcode() == AMDGPU::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. 
- if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI)) + if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) return false; // TODO: We don't support KC merging atm return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0; @@ -874,10 +891,9 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const { } } - bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, unsigned ExtraPredCycles, BranchProbability Probability) const{ return true; @@ -896,7 +912,7 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, + unsigned NumCycles, BranchProbability Probability) const { return true; @@ -908,7 +924,6 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, return false; } - bool R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { MachineOperand &MO = Cond[1]; @@ -948,7 +963,6 @@ bool R600InstrInfo::DefinesPredicate(MachineInstr &MI, return isPredicateSetter(MI.getOpcode()); } - bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { int PIdx = MI.findFirstPredOperandIdx(); @@ -1067,7 +1081,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } -void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600FrameLowering *TFL = ST.getFrameLowering(); diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index a280052dbd4a..3b828006807e 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -177,12 +177,12 @@ public: bool isPredicated(const MachineInstr &MI) const override; - bool isPredicable(MachineInstr &MI) const override; + bool isPredicable(const MachineInstr &MI) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, BranchProbability Probability) const override; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, BranchProbability Probability) const override ; diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 9210e66b0fe7..bac557ba989e 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -316,7 +316,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern> class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || - (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] + (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }] >; def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>; @@ -326,8 +326,8 @@ def vtx_id3_load : LoadParamFrag<load>; class LoadVtxId1 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && 
!isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout()))); }]>; @@ -339,7 +339,7 @@ def vtx_id1_load : LoadVtxId1 <load>; class LoadVtxId2 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); }]>; @@ -1013,7 +1013,7 @@ multiclass CUBE_Common <bits<11> inst> { (outs R600_Reg128:$dst), (ins R600_Reg128:$src0), "CUBE $dst $src0", - [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + [(set v4f32:$dst, (int_r600_cube v4f32:$src0))], VecALU > { let isPseudo = 1; diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index d70f52e0f295..b7e62075244b 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -34,15 +35,6 @@ namespace { typedef std::pair<BasicBlock *, Value *> StackEntry; typedef SmallVector<StackEntry, 16> StackVector; -// Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.amdgcn.if"; -static const char *const ElseIntrinsic = "llvm.amdgcn.else"; -static const char *const BreakIntrinsic = "llvm.amdgcn.break"; -static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break"; -static const char *const LoopIntrinsic = "llvm.amdgcn.loop"; -static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf"; - class SIAnnotateControlFlow : public FunctionPass { DivergenceAnalysis *DA; @@ -56,13 +48,13 @@ class SIAnnotateControlFlow : public FunctionPass { UndefValue *BoolUndef; Constant *Int64Zero; - Constant *If; - Constant *Else; - Constant *Break; - Constant *IfBreak; - Constant *ElseBreak; - Constant *Loop; - Constant *EndCf; + Function *If; + Function *Else; + Function *Break; + Function *IfBreak; + Function *ElseBreak; + Function *Loop; + Function *EndCf; DominatorTree *DT; StackVector Stack; @@ -86,7 +78,8 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); Value *handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term); + llvm::Loop *L, BranchInst *Term, + SmallVectorImpl<WeakVH> &LoopPhiConditions); void handleLoop(BranchInst *Term); @@ -118,6 +111,7 @@ public: INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -138,30 +132,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { BoolUndef = UndefValue::get(Boolean); Int64Zero = ConstantInt::get(Int64, 0); - If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); - - Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); - - Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)nullptr); - cast<Function>(Break)->setDoesNotAccessMemory(); - - IfBreak = M.getOrInsertFunction( - 
IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); - cast<Function>(IfBreak)->setDoesNotAccessMemory();; - - ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); - cast<Function>(ElseBreak)->setDoesNotAccessMemory(); - - Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)nullptr); - - EndCf = M.getOrInsertFunction( - EndCfIntrinsic, Void, Int64, (Type *)nullptr); - + If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); + Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); + Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break); + IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); + ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break); + Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); + EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); return false; } @@ -208,15 +185,16 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { // \brief Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (!Phi->hasNUsesOrMore(1)) - Phi->eraseFromParent(); + if (llvm::RecursivelyDeleteDeadPHINode(Phi)) { + DEBUG(dbgs() << "Erased unused condition phi\n"); + } } /// \brief Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { - if (isUniform(Term)) { + if (isUniform(Term)) return; - } + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -233,8 +211,10 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { } /// \brief Recursively handle the condition leading to a loop -Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term) { +Value *SIAnnotateControlFlow::handleLoopCondition( + Value *Cond, PHINode *Broken, + llvm::Loop *L, BranchInst *Term, + SmallVectorImpl<WeakVH> &LoopPhiConditions) { // Only search through PHI nodes which are inside the loop. 
If we try this // with PHI nodes that are outside of the loop, we end up inserting new PHI @@ -245,7 +225,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front()); Value *Ret = NewPhi; // Handle all non-constant incoming values first @@ -258,14 +238,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, } Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L, + Term, LoopPhiConditions); NewPhi->addIncoming(PhiArg, From); } BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); if (Incoming != BoolTrue) continue; @@ -295,14 +275,17 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, continue; } } + TerminatorInst *Insert = From->getTerminator(); Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); NewPhi->setIncomingValue(i, PhiArg); } - eraseIfUnused(Phi); + + LoopPhiConditions.push_back(WeakVH(Phi)); return Ret; + } - } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; if (L->contains(Inst)) { @@ -310,46 +293,55 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, } else { Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); } + Value *Args[] = { Cond, Broken }; return CallInst::Create(IfBreak, Args, "", Insert); + } - // Insert IfBreak before TERM for constant COND. - } else if (isa<ConstantInt>(Cond)) { - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Term); + // Insert IfBreak in the loop header TERM for constant COND other than true. + if (isa<Constant>(Cond)) { + Instruction *Insert = Cond == BoolTrue ? + Term : L->getHeader()->getTerminator(); - } else { - llvm_unreachable("Unhandled loop condition!"); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); } - return nullptr; + + llvm_unreachable("Unhandled loop condition!"); } /// \brief Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { - if (isUniform(Term)) { + if (isUniform(Term)) return; - } BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); if (!L) return; + BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); + SmallVector<WeakVH, 8> LoopPhiConditions; Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L, Term); + Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions); - for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); - PI != PE; ++PI) { + for (BasicBlock *Pred : predecessors(Target)) + Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred); + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - Broken->addIncoming(*PI == BB ? 
Arg : Int64Zero, *PI); + for (WeakVH Val : reverse(LoopPhiConditions)) { + if (PHINode *Cond = cast_or_null<PHINode>(Val)) + eraseIfUnused(Cond); } - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); -}/// \brief Close the last opened control flow +} + +/// \brief Close the last opened control flow void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); @@ -359,59 +351,62 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { // We can't insert an EndCF call into a loop header, because it will // get executed on every iteration of the loop, when it should be // executed only once before the loop. - SmallVector <BasicBlock*, 8> Latches; + SmallVector <BasicBlock *, 8> Latches; L->getLoopLatches(Latches); - std::vector<BasicBlock*> Preds; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (!is_contained(Latches, *PI)) - Preds.push_back(*PI); + SmallVector<BasicBlock *, 2> Preds; + for (BasicBlock *Pred : predecessors(BB)) { + if (!is_contained(Latches, Pred)) + Preds.push_back(Pred); } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } Value *Exec = popSaved(); - if (!isa<UndefValue>(Exec)) - CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt()); + Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt(); + if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) + CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } /// \brief Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DivergenceAnalysis>(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { - - BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + BasicBlock *BB = *I; + BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); if (!Term || Term->isUnconditional()) { - if (isTopOfStack(*I)) - closeControlFlow(*I); + if (isTopOfStack(BB)) + closeControlFlow(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { - if (isTopOfStack(*I)) - closeControlFlow(*I); + if (isTopOfStack(BB)) + closeControlFlow(BB); handleLoop(Term); continue; } - if (isTopOfStack(*I)) { + if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); - if (Phi && Phi->getParent() == *I && isElse(Phi)) { + if (Phi && Phi->getParent() == BB && isElse(Phi)) { insertElse(Term); eraseIfUnused(Phi); continue; } - closeControlFlow(*I); + + closeControlFlow(BB); } + openIf(Term); } diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index ff4e32147184..3dd372b32866 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -36,6 +36,7 @@ enum : uint64_t { // TODO: Should this be spilt into VOP3 a and b? VOP3 = 1 << 10, + VOP3P = 1 << 12, VINTRP = 1 << 13, SDWA = 1 << 14, @@ -65,8 +66,8 @@ enum : uint64_t { SOPK_ZEXT = UINT64_C(1) << 38, SCALAR_STORE = UINT64_C(1) << 39, FIXED_SIZE = UINT64_C(1) << 40, - VOPAsmPrefer32Bit = UINT64_C(1) << 41 - + VOPAsmPrefer32Bit = UINT64_C(1) << 41, + HasFPClamp = UINT64_C(1) << 42 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
@@ -102,12 +103,14 @@ namespace AMDGPU { OPERAND_REG_INLINE_C_FP16, OPERAND_REG_INLINE_C_FP32, OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_V2FP16, + OPERAND_REG_INLINE_C_V2INT16, OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -125,9 +128,12 @@ namespace AMDGPU { // NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum { - NEG = 1 << 0, // Floating-point negate modifier - ABS = 1 << 1, // Floating-point absolute modifier - SEXT = 1 << 0 // Integer sign-extend modifier + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 0, // Integer sign-extend modifier + NEG_HI = ABS, // Floating-point negate high packed component modifier. + OP_SEL_0 = 1 << 2, + OP_SEL_1 = 1 << 3 }; } @@ -242,6 +248,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_SYMBOLIC_LAST_ = 8, + ID_MEM_BASES = 15, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -251,14 +258,20 @@ enum Offset { // Offset, (5) [10:6] OFFSET_DEFAULT_ = 0, OFFSET_SHIFT_ = 6, OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), + + OFFSET_SRC_SHARED_BASE = 16, + OFFSET_SRC_PRIVATE_BASE = 0 }; enum WidthMinusOne { // WidthMinusOne, (5) [15:11] WIDTH_M1_DEFAULT_ = 31, WIDTH_M1_SHIFT_ = 11, WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), + + WIDTH_M1_SRC_SHARED_BASE = 15, + WIDTH_M1_SRC_PRIVATE_BASE = 15 }; } // namespace Hwreg @@ -300,6 +313,9 @@ enum DstUnused { #define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) #define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F) #define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TRAP_HANDLER(x) (((x) & 0x1) << 6) +#define G_00B84C_TRAP_HANDLER(x) (((x) >> 6) & 0x1) +#define C_00B84C_TRAP_HANDLER 0xFFFFFFBF #define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) #define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) #define C_00B84C_TGID_X_EN 0xFFFFFF7F @@ -387,7 +403,6 @@ enum DstUnused { #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 - } // End namespace llvm #endif diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 6a422e70fe1f..f9d258f44a62 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -65,6 +65,7 @@ /// ultimately led to the creation of an illegal COPY. //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseSet.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -198,6 +199,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (!CopyUse.isCopy()) return false; + // It is illegal to have vreg inputs to a physreg defining reg_sequence. 
+ if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg())) + return false; + const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); @@ -234,8 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); - BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) - .addOperand(MI.getOperand(I)); + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), + TmpReg) + .add(MI.getOperand(I)); MI.getOperand(I).setReg(TmpReg); } @@ -326,6 +332,27 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return true; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + DenseSet<MachineBasicBlock*> Visited; + SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), + MBB->pred_end()); + + while (!Worklist.empty()) { + MachineBasicBlock *mbb = Worklist.back(); + Worklist.pop_back(); + + if (!Visited.insert(mbb).second) + continue; + if (hasTerminatorThatModifiesExec(*mbb, *TRI)) + return true; + + Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end()); + } + + return false; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -382,8 +409,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1); - if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) { + if (!predsHasDivergentTerminator(MBB0, TRI) && + !predsHasDivergentTerminator(MBB1, TRI)) { DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp new file mode 100644 index 000000000000..3d3121788b5e --- /dev/null +++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -0,0 +1,72 @@ +//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Add implicit use of exec to vector register copies. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-vgpr-copies" + +namespace { + +class SIFixVGPRCopies : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixVGPRCopies() : MachineFunctionPass(ID) { + initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fix VGPR copies"; } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false) + +char SIFixVGPRCopies::ID = 0; + +char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; + +bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) { + MI.addOperand(MF, + MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + DEBUG(dbgs() << "Add exec use to " << MI); + Changed = true; + } + break; + default: + break; + } + } + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index a5c0d4923d6b..d63414735b95 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -12,6 +12,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -66,6 +67,7 @@ public: MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; + const SISubtarget *ST; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, @@ -75,6 +77,12 @@ public: void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + const MachineOperand *isClamp(const MachineInstr &MI) const; + bool tryFoldClamp(MachineInstr &MI); + + std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const; + bool tryFoldOMod(MachineInstr &MI); + public: SIFoldOperands() : MachineFunctionPass(ID) { initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); @@ -131,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } -static bool isSafeToFold(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: { - // If there are additional implicit register operands, this may be used for - // register indexing so the source register operand isn't simply copied. - unsigned NumOps = MI.getDesc().getNumOperands() + - MI.getDesc().getNumImplicitUses(); - - return MI.getNumOperands() == NumOps; - } - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - static bool updateOperand(FoldCandidate &Fold, const TargetRegisterInfo &TRI) { MachineInstr *MI = Fold.UseMI; @@ -359,8 +346,6 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *FoldRC = TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); - APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), - OpToFold.getImm()); // Split 64-bit constants into 32-bits for folding. 
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { @@ -370,21 +355,25 @@ void SIFoldOperands::foldOperand( MRI->getRegClass(UseReg) : TRI->getPhysRegClass(UseReg); - assert(Imm.getBitWidth() == 64); - if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) return; + APInt Imm(64, OpToFold.getImm()); if (UseOp.getSubReg() == AMDGPU::sub0) { Imm = Imm.getLoBits(32); } else { assert(UseOp.getSubReg() == AMDGPU::sub1); Imm = Imm.getHiBits(32); } + + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; } - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, @@ -581,6 +570,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, return false; } +// Try to fold an instruction into a simpler one +static bool tryFoldInst(const SIInstrInfo *TII, + MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + + if (Opc == AMDGPU::V_CNDMASK_B32_e32 || + Opc == AMDGPU::V_CNDMASK_B32_e64 || + Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) { + const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); + if (Src1->isIdenticalTo(*Src0)) { + DEBUG(dbgs() << "Folded " << *MI << " into "); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx != -1) + MI->RemoveOperand(Src2Idx); + MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); + mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY + : getMovOpc(false))); + DEBUG(dbgs() << *MI << '\n'); + return true; + } + } + + return false; +} + void SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit @@ -682,20 +697,213 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + tryFoldInst(TII, Fold.UseMI); } } } +const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { + unsigned Op = MI.getOpcode(); + switch (Op) { + case AMDGPU::V_MAX_F32_e64: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F64: { + if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) + return nullptr; + + // Make sure sources are identical. + const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() || + Src0->getSubReg() != AMDGPU::NoSubRegister) + return nullptr; + + // Can't fold up if we have modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return nullptr; + return Src0; + } + default: + return nullptr; + } +} + +// We obviously have multiple uses in a clamp since the register is used twice +// in the same instruction. 
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { + int Count = 0; + for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end(); + I != E; ++I) { + if (++Count > 1) + return false; + } + + return true; +} + +bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { + const MachineOperand *ClampSrc = isClamp(MI); + if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) + return false; + + MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); + if (!TII->hasFPClamp(*Def)) + return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); + if (!DefClamp) + return false; + + DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n'); + + // Clamp is applied after omod, so it is OK if omod is set. + DefClamp->setImm(1); + MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; +} + +static int getOModValue(unsigned Opc, int64_t Val) { + switch (Opc) { + case AMDGPU::V_MUL_F32_e64: { + switch (static_cast<uint32_t>(Val)) { + case 0x3f000000: // 0.5 + return SIOutMods::DIV2; + case 0x40000000: // 2.0 + return SIOutMods::MUL2; + case 0x40800000: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } + case AMDGPU::V_MUL_F16_e64: { + switch (static_cast<uint16_t>(Val)) { + case 0x3800: // 0.5 + return SIOutMods::DIV2; + case 0x4000: // 2.0 + return SIOutMods::MUL2; + case 0x4400: // 4.0 + return SIOutMods::MUL4; + default: + return SIOutMods::NONE; + } + } + default: + llvm_unreachable("invalid mul opcode"); + } +} + +// FIXME: Does this really not support denormals with f16? +// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not +// handled, so will anything other than that break? +std::pair<const MachineOperand *, int> +SIFoldOperands::isOMod(const MachineInstr &MI) const { + unsigned Op = MI.getOpcode(); + switch (Op) { + case AMDGPU::V_MUL_F32_e64: + case AMDGPU::V_MUL_F16_e64: { + // If output denormals are enabled, omod is ignored. + if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) || + (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals())) + return std::make_pair(nullptr, SIOutMods::NONE); + + const MachineOperand *RegOp = nullptr; + const MachineOperand *ImmOp = nullptr; + const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src0->isImm()) { + ImmOp = Src0; + RegOp = Src1; + } else if (Src1->isImm()) { + ImmOp = Src1; + RegOp = Src0; + } else + return std::make_pair(nullptr, SIOutMods::NONE); + + int OMod = getOModValue(Op, ImmOp->getImm()); + if (OMod == SIOutMods::NONE || + TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + TII->hasModifiersSet(MI, AMDGPU::OpName::omod) || + TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return std::make_pair(nullptr, SIOutMods::NONE); + + return std::make_pair(RegOp, OMod); + } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_ADD_F16_e64: { + // If output denormals are enabled, omod is ignored. 
+ if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) || + (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals())) + return std::make_pair(nullptr, SIOutMods::NONE); + + // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x + const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + + if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() && + Src0->getSubReg() == Src1->getSubReg() && + !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return std::make_pair(Src0, SIOutMods::MUL2); + + return std::make_pair(nullptr, SIOutMods::NONE); + } + default: + return std::make_pair(nullptr, SIOutMods::NONE); + } +} + +// FIXME: Does this need to check IEEE bit on function? +bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { + const MachineOperand *RegOp; + int OMod; + std::tie(RegOp, OMod) = isOMod(MI); + if (OMod == SIOutMods::NONE || !RegOp->isReg() || + RegOp->getSubReg() != AMDGPU::NoSubRegister || + !hasOneNonDBGUseInst(*MRI, RegOp->getReg())) + return false; + + MachineInstr *Def = MRI->getVRegDef(RegOp->getReg()); + MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod); + if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) + return false; + + // Clamp is applied after omod. If the source already has clamp set, don't + // fold it. + if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) + return false; + + DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); + + DefOMod->setImm(OMod); + MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); + MI.eraseFromParent(); + return true; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MRI = &MF.getRegInfo(); - TII = ST.getInstrInfo(); + ST = &MF.getSubtarget<SISubtarget>(); + TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // omod is ignored by hardware if IEEE bit is enabled. omod also does not + // correctly handle signed zeros. + // + // TODO: Check nsz on instructions when fast math flags are preserved to MI + // level. 
+ bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -705,8 +913,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (!isSafeToFold(MI)) + tryFoldInst(TII, &MI); + + if (!TII->isFoldableCopy(MI)) { + if (IsIEEEMode || !tryFoldOMod(MI)) + tryFoldClamp(MI); continue; + } MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 0b5715515880..abe6af9a6d3f 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -21,22 +21,24 @@ using namespace llvm; -static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF, - const SIRegisterInfo *TRI) { +static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST, + const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), - TRI->getMaxNumSGPRs(MF) / 4); + ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF, - const SIRegisterInfo *TRI) { +static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, + const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - TRI->getMaxNumSGPRs(MF)); + ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, +void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + // We don't need this if we only have spills since there is no user facing // scratch. @@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII, MRI.addLiveIn(FlatScratchInitReg); MBB.addLiveIn(FlatScratchInitReg); - // Copy the size in bytes. - unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) - .addReg(FlatScrInitHi, RegState::Kill); - unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + // Do a 64-bit pointer add. + if (ST.flatScratchIsPointer()) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitHi) + .addImm(0); + + return; + } + + // Copy the size in bytes. + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + // Add wave offset in bytes to private base offset. // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. 
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) @@ -111,16 +125,15 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; - ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI); + ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. + // Skip the last N reserved elements because they should have already been + // reserved for VCC etc. for (MCPhysReg Reg : AllSGPR128s) { // Pick the first unallocated one. Make sure we don't clobber the other // reserved input we needed. if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - //assert(MRI.isAllocatable(Reg)); MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); return Reg; @@ -143,10 +156,9 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI); + ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) return ScratchWaveOffsetReg; @@ -190,6 +202,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + auto AMDGPUASI = ST.getAMDGPUAS(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -229,7 +242,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // emitted after frame indices are eliminated. if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) - emitFlatScratchInit(TII, TRI, MF, MBB); + emitFlatScratchInit(ST, MF, MBB); // We need to insert initialization of the scratch resource descriptor. 
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( @@ -328,7 +341,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, PointerType *PtrTy = PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()), - AMDGPUAS::CONSTANT_ADDRESS); + AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -371,6 +384,24 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, } +static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { + for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); + I != E; ++I) { + if (!MFI.isDeadObjectIndex(I)) + return false; + } + + return true; +} + +int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + + FrameReg = RI->getFrameRegister(MF); + return MF.getFrameInfo().getObjectOffset(FI); +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { @@ -379,15 +410,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + bool AllSGPRSpilledToVGPRs = false; + + if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { + AllSGPRSpilledToVGPRs = true; + + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // XXX - This operates under the assumption that only other SGPR spills are + // users of the frame index. I'm not 100% sure this is correct. The + // StackColoring pass has a comment saying a future improvement would be to + // merging of allocas with spill slots, but for now according to + // MachineFrameInfo isSpillSlot can't alias any other object. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (TII->isSGPRSpill(MI)) { + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } else + AllSGPRSpilledToVGPRs = false; + } + } + } - assert((RS || !MayNeedScavengingEmergencySlot) && - "RegScavenger required if spilling"); + FuncInfo->removeSGPRToVGPRFrameIndices(MFI); + } - if (MayNeedScavengingEmergencySlot) { - int ScavengeFI = MFI.CreateStackObject( - AMDGPU::SGPR_32RegClass.getSize(), - AMDGPU::SGPR_32RegClass.getAlignment(), false); + // FIXME: The other checks should be redundant with allStackObjectsAreDead, + // but currently hasNonSpillStackObjects is set only from source + // allocas. Stack temps produced from legalization are not counted currently. 
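The check that follows this FIXME boils down to a single predicate: an emergency scavenging slot is only needed if some real stack access can survive the SGPR-to-VGPR spill lowering done above. A condensed restatement (the fields mirror the calls in the condition; this is not a real LLVM interface):

struct FrameState {
  bool HasNonSpillStackObjects;
  bool HasSpilledVGPRs;
  bool AllSGPRSpilledToVGPRs;   // every SGPR spill received a VGPR lane
  bool AllStackObjectsAreDead;
};

// True when the fixed scavenging object at offset 0 below must be created.
bool needsScavengeSlot(const FrameState &S) {
  return S.HasNonSpillStackObjects || S.HasSpilledVGPRs ||
         !S.AllSGPRSpilledToVGPRs || !S.AllStackObjectsAreDead;
}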
+ if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() || + !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) { + assert(RS && "RegScavenger required if spilling"); + + // We force this to be at offset 0 so no user object ever has 0 as an + // address, so we may use 0 as an invalid pointer value. This is because + // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca + // is required to be address space 0, we are forced to accept this for + // now. Ideally we could have the stack in another address space with 0 as a + // valid pointer, and -1 as the null value. + // + // This will also waste additional space when user stack objects require > 4 + // byte alignment. + // + // The main cost here is losing the offset for addressing modes. However + // this also ensures we shouldn't need a register for the offset when + // emergency scavenging. + int ScavengeFI = MFI.CreateFixedObject( + AMDGPU::SGPR_32RegClass.getSize(), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 7657b4e03864..1bfc08093da2 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -30,14 +30,15 @@ public: MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; private: - void emitFlatScratchInit(const SIInstrInfo *TII, - const SIRegisterInfo* TRI, + void emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index b98f9f400ee7..7268131396dc 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,26 +15,70 @@ #ifdef _MSC_VER // Provide M_PI. 
#define _USE_MATH_DEFINES -#include <cmath> #endif #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" #include "SIDefines.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/DAGCombine.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <cmath> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <utility> +#include <vector> using namespace llvm; @@ -43,7 +87,6 @@ static cl::opt<bool> EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); - static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -84,6 +127,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); } + if (Subtarget->hasVOP3PInsts()) { + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + } + computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -110,7 +158,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); @@ -142,10 +189,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -153,9 +207,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::UADDO, MVT::i32, Legal); + setOperationAction(ISD::USUBO, MVT::i32, Legal); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, + MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -202,6 +260,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + // Avoid stack access for these. + // TODO: Generalize to more vector types. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); @@ -222,7 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Custom); + setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -303,6 +369,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Custom); // F16 - VOP2 Actions. 
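For the ISD::UADDO / ISD::USUBO lines marked Legal above, the node yields both the wrapping result and the overflow bit, so it can be selected onto a carry-setting add directly. A scalar model of the i32 semantics (illustration only):

#include <cstdint>
#include <utility>

// Illustration only: what the {sum, overflow} pair of ISD::UADDO means.
std::pair<uint32_t, bool> uaddo32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;        // wrapping add
  bool Overflow = Sum < A;     // carry out of bit 31
  return {Sum, Overflow};
}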
setOperationAction(ISD::BR_CC, MVT::f16, Expand); @@ -317,6 +384,85 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); } + if (Subtarget->hasVOP3PInsts()) { + for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch (Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + // XXX - Do these do anything? Vector constants turn into build_vector. + setOperationAction(ISD::Constant, MVT::v2i16, Legal); + setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + + setOperationAction(ISD::STORE, MVT::v2i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); + setOperationAction(ISD::STORE, MVT::v2f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v2f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::AND, MVT::v2i16, Promote); + AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); + setOperationAction(ISD::OR, MVT::v2i16, Promote); + AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); + setOperationAction(ISD::XOR, MVT::v2i16, Promote); + AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + + setOperationAction(ISD::ADD, MVT::v2i16, Legal); + setOperationAction(ISD::SUB, MVT::v2i16, Legal); + setOperationAction(ISD::MUL, MVT::v2i16, Legal); + setOperationAction(ISD::SHL, MVT::v2i16, Legal); + setOperationAction(ISD::SRL, MVT::v2i16, Legal); + setOperationAction(ISD::SRA, MVT::v2i16, Legal); + setOperationAction(ISD::SMIN, MVT::v2i16, Legal); + setOperationAction(ISD::UMIN, MVT::v2i16, Legal); + setOperationAction(ISD::SMAX, MVT::v2i16, Legal); + setOperationAction(ISD::UMAX, MVT::v2i16, Legal); + + setOperationAction(ISD::FADD, MVT::v2f16, Legal); + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + setOperationAction(ISD::FMUL, MVT::v2f16, Legal); + setOperationAction(ISD::FMA, MVT::v2f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); + + // This isn't really legal, but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + } + setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -332,6 +478,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FCANONICALIZE); + 
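Why the Promote/AddPromotedToType pairs for v2i16 bitwise ops and loads/stores in the hasVOP3PInsts block above are sound: a bitwise operation on the containing 32-bit word acts independently on each packed 16-bit lane, so no cross-lane fixup is needed. A small illustration (plain C++, not backend code):

#include <cstdint>

// Illustration only: lanewise behaviour of an i32 op on packed 16-bit lanes.
uint32_t packedAnd(uint16_t ALo, uint16_t AHi, uint16_t BLo, uint16_t BHi) {
  uint32_t A = (uint32_t)AHi << 16 | ALo;
  uint32_t B = (uint32_t)BHi << 16 | BLo;
  return A & B;   // lane 0 = ALo & BLo, lane 1 = AHi & BHi
}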
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + setTargetDAGCombine(ISD::ZERO_EXTEND); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -364,30 +512,49 @@ const SISubtarget *SITargetLowering::getSubtarget() const { // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. + return false; +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, unsigned IntrID) const { switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_atomic_dec: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align = 0; - Info.vol = false; + + const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); + Info.vol = !Vol || !Vol->isNullValue(); Info.readMem = true; Info.writeMem = true; return true; + } default: return false; } } -bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, - EVT) const { - // SI has some legal vector types, but no legal vector operations. Say no - // shuffles are legal in order to prefer scalarizing some vector operations. - return false; +bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, + SmallVectorImpl<Value*> &Ops, + Type *&AccessTy) const { + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + Value *Ptr = II->getArgOperand(0); + AccessTy = II->getType(); + Ops.push_back(Ptr); + return true; + } + default: + return false; + } } bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { @@ -438,8 +605,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.BaseGV) return false; - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. @@ -454,8 +620,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } return isLegalMUBUFAddressingMode(AM); - } - case AMDGPUAS::CONSTANT_ADDRESS: { + } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -478,7 +643,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. 
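A stand-alone restatement of the isUInt<20> check that follows this comment, covering only the VI-and-later branch (illustration only):

#include <cstdint>

// The SMEM immediate is a 20-bit unsigned byte offset; anything larger has
// to be materialized in a register operand instead.
bool fitsSMEMImmByteOffset(int64_t ByteOffs) {
  return ByteOffs >= 0 && ByteOffs < (1 << 20);
}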
if (!isUInt<20>(AM.BaseOffs)) return false; @@ -492,13 +657,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } - case AMDGPUAS::PRIVATE_ADDRESS: + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { return isLegalMUBUFAddressingMode(AM); - - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS || + AS == AMDGPUASI.REGION_ADDRESS) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -513,17 +676,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } - case AMDGPUAS::FLAT_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + } else if (AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) { // For an unknown address space, this usually means that this is for some // reason being used for pure arithmetic, and not based on some addressing // computation. We don't have instructions that compute pointers with any // addressing modes, so treat them as having no offset like flat // instructions. return isLegalFlatAddressingMode(AM); - - default: + } else { llvm_unreachable("unhandled address space"); } } @@ -544,8 +705,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return false; } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || - AddrSpace == AMDGPUAS::REGION_ADDRESS) { + if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS || + AddrSpace == AMDGPUASI.REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. @@ -560,8 +721,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. if (!Subtarget->hasUnalignedScratchAccess() && - (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || - AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { + (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS || + AddrSpace == AMDGPUASI.FLAT_ADDRESS)) { return false; } @@ -569,7 +730,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? (Align % 4 == 0) : true; } @@ -609,15 +770,16 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } -static bool isFlatGlobalAddrSpace(unsigned AS) { - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; +static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { + return AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) && + isFlatGlobalAddrSpace(DestAS, AMDGPUASI); } bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { @@ -631,7 +793,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { // Flat -> private/local is a simple truncate. 
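The recurring AMDGPUAS::X to AMDGPUASI.X rewrite throughout this file replaces compile-time enum constants with fields of a per-target struct, so address-space numbering can vary by target; isFlatGlobalAddrSpace above now takes that struct as a parameter. A minimal model of the shape (field set abbreviated, not the real definition):

struct AMDGPUAddrSpaces {
  unsigned GLOBAL_ADDRESS;
  unsigned CONSTANT_ADDRESS;
  unsigned FLAT_ADDRESS;
  unsigned LOCAL_ADDRESS;
  unsigned PRIVATE_ADDRESS;
};

// Mirrors the reworked isFlatGlobalAddrSpace predicate above.
bool isFlatGlobal(unsigned AS, const AMDGPUAddrSpaces &ASI) {
  return AS == ASI.GLOBAL_ADDRESS || AS == ASI.FLAT_ADDRESS ||
         AS == ASI.CONSTANT_ADDRESS;
}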
// Flat -> global is no-op - if (SrcAS == AMDGPUAS::FLAT_ADDRESS) + if (SrcAS == AMDGPUASI.FLAT_ADDRESS) return true; return isNoopAddrSpaceCast(SrcAS, DestAS); @@ -639,18 +801,8 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); - const Value *Ptr = MemNode->getMemOperand()->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || - isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) - return true; - - const Instruction *I = dyn_cast<Instruction>(Ptr); - return I && I->getMetadata("amdgpu.uniform"); + return AMDGPU::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -693,40 +845,28 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { return TargetLowering::isTypeDesirableForOp(Op, VT); } -SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, - const SDLoc &SL, SDValue Chain, - unsigned Offset) const { +SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + uint64_t Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, + SIRegisterInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - const SDLoc &SL, SDValue Chain, - unsigned Offset, bool Signed, +SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Val, + bool Signed, const ISD::InputArg *Arg) const { - const DataLayout &DL = DAG.getDataLayout(); - Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - - unsigned Align = DL.getABITypeAlignment(Ty); - - SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, - MachineMemOperand::MONonTemporal | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - - SDValue Val = Load; if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext : ISD::AssertSext; @@ -740,373 +880,434 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, else Val = DAG.getZExtOrTrunc(Val, SL, VT); - return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); + return Val; } -SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - - MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); +SDValue SITargetLowering::lowerKernargMemParameter( + SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + uint64_t Offset, bool Signed, + const ISD::InputArg *Arg) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA( - *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); - DAG.getContext()->diagnose(NoGraphicsHSA); - return DAG.getEntryNode(); - } + unsigned Align = DL.getABITypeAlignment(Ty); - // Create stack objects that are used for emitting debugger prologue if - // "amdgpu-debugger-emit-prologue" attribute was specified. - if (ST.debuggerEmitPrologue()) - createDebuggerPrologueStackObjects(MF); + SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); + SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, + MachineMemOperand::MONonTemporal | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); - SmallVector<ISD::InputArg, 16> Splits; - BitVector Skipped(Ins.size()); + SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); + return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); +} - for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { - const ISD::InputArg &Arg = Ins[i]; +static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, + CallingConv::ID CallConv, + ArrayRef<ISD::InputArg> Ins, + BitVector &Skipped, + FunctionType *FType, + SIMachineFunctionInfo *Info) { + for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { + const ISD::InputArg &Arg = Ins[I]; - // First check if it's a PS input addr + // First check if it's a PS input addr. if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { - // We can safely skip PS inputs - Skipped.set(i); + // We can safely skip PS inputs. + Skipped.set(I); ++PSInputNum; continue; } Info->markPSInputAllocated(PSInputNum); if (Arg.Used) - Info->PSInputEna |= 1 << PSInputNum; + Info->markPSInputEnabled(PSInputNum); ++PSInputNum; } - if (AMDGPU::isShader(CallConv)) { - // Second split vertices into their elements - if (Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. 
a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - } else { - Splits.push_back(Arg); + // Second split vertices into their elements. + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned J = 0; J != NumElements; ++J) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); } + } else { + Splits.push_back(Arg); } } +} - SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); +// Allocate special inputs passed in VGPRs. +static void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasWorkItemIDX()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - // At least one interpolation mode must be enabled or else the GPU will hang. - // - // Check PSInputAddr instead of PSInputEna. The idea is that if the user set - // PSInputAddr, the user wants to enable some bits after the compilation - // based on run-time states. Since we can't know what the final PSInputEna - // will look like, so we shouldn't do anything here and the user should take - // responsibility for the correct programming. - // - // Otherwise, the following restrictions apply: - // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. - // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be - // enabled too. - if (CallConv == CallingConv::AMDGPU_PS && - ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - Info->markPSInputAllocated(0); - Info->PSInputEna |= 1; - } - - if (!AMDGPU::isShader(CallConv)) { - assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); - } else { - assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + if (Info.hasWorkItemIDY()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } - if (Info->hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass); + if (Info.hasWorkItemIDZ()) { + unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } +} + +// Allocate special inputs passed in user SGPRs. 
+static void allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasPrivateMemoryInputPtr()) { + unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); + MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(PrivateMemoryPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? - if (Info->hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); - MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + if (Info.hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info->hasDispatchPtr()) { - unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + if (Info.hasDispatchPtr()) { + unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->hasQueuePtr()) { - unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + if (Info.hasQueuePtr()) { + unsigned QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info->hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + if (Info.hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(InputPtrReg); } - if (Info->hasDispatchID()) { - unsigned DispatchIDReg = Info->addDispatchID(*TRI); + if (Info.hasDispatchID()) { + unsigned DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info->hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + if (Info.hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } - if (!AMDGPU::isShader(CallConv)) - analyzeFormalArgumentsCompute(CCInfo, Ins); - else - AnalyzeFormalArguments(CCInfo, Splits); - - SmallVector<SDValue, 16> Chains; - - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { - - const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { - InVals.push_back(DAG.getUNDEF(Arg.VT)); - continue; - } - - CCValAssign &VA = ArgLocs[ArgIdx++]; - MVT VT = VA.getLocVT(); - - if (VA.isMemLoc()) { - VT = Ins[i].VT; - EVT MemVT = VA.getLocVT(); - const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + - VA.getLocMemOffset(); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, - Offset, Ins[i].Flags.isSExt(), - &Ins[i]); - Chains.push_back(Arg.getValue(1)); - - auto *ParamTy = - dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && - ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // On SI local pointers are just offsets into LDS, so they are always - // less than 16-bits. On CI and newer they could potentially be - // real pointers, so we can't guarantee their size. 
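A simplified model of the user-SGPR layout that allocateHSAUserSGPRs builds above: each enabled feature claims the next registers in a fixed order, with counts matching the register classes used (4 for the SGPR_128 buffer, 2 per 64-bit pointer). The feature subset and names here are illustrative only:

#include <string>
#include <utility>
#include <vector>

// Illustration only: a feature's starting SGPR depends on what precedes it.
std::vector<std::pair<std::string, unsigned>>
layoutUserSGPRs(bool HasPrivateSegmentBuffer, bool HasDispatchPtr,
                bool HasQueuePtr, bool HasKernargSegmentPtr) {
  std::vector<std::pair<std::string, unsigned>> Layout;
  unsigned NextSGPR = 0;
  auto Claim = [&](const char *Name, unsigned NumRegs) {
    Layout.emplace_back(Name, NextSGPR);
    NextSGPR += NumRegs;
  };
  if (HasPrivateSegmentBuffer) Claim("private_segment_buffer", 4);
  if (HasDispatchPtr)          Claim("dispatch_ptr", 2);
  if (HasQueuePtr)             Claim("queue_ptr", 2);
  if (HasKernargSegmentPtr)    Claim("kernarg_segment_ptr", 2);
  return Layout;
}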
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); - } - - InVals.push_back(Arg); - Info->setABIArgOffset(Offset + MemVT.getStoreSize()); - continue; - } - assert(VA.isRegLoc() && "Parameter must be in a register!"); - - unsigned Reg = VA.getLocReg(); - - if (VT == MVT::i64) { - // For now assume it is a pointer - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SGPR_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass); - SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); - InVals.push_back(Copy); - continue; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - - Reg = MF.addLiveIn(Reg, RC); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - - if (Arg.VT.isVector()) { - - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector<SDValue, 4> Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - - SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); - Regs.push_back(Copy); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); - continue; - } - - InVals.push_back(Val); - } - // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. +} - // Start adding system SGPRs. - if (Info->hasWorkGroupIDX()) { - unsigned Reg = Info->addWorkGroupIDX(); +// Allocate special input registers that are initialized per-wave. +static void allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + bool IsShader) { + if (Info.hasWorkGroupIDX()) { + unsigned Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupIDY()) { - unsigned Reg = Info->addWorkGroupIDY(); + if (Info.hasWorkGroupIDY()) { + unsigned Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupIDZ()) { - unsigned Reg = Info->addWorkGroupIDZ(); + if (Info.hasWorkGroupIDZ()) { + unsigned Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasWorkGroupInfo()) { - unsigned Reg = Info->addWorkGroupInfo(); + if (Info.hasWorkGroupInfo()) { + unsigned Reg = Info.addWorkGroupInfo(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); CCInfo.AllocateReg(Reg); } - if (Info->hasPrivateSegmentWaveByteOffset()) { + if (Info.hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. 
unsigned PrivateSegmentWaveByteOffsetReg; - if (AMDGPU::isShader(CallConv)) { + if (IsShader) { PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); - Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); } else - PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); + PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } +} +static void reservePrivateMemoryRegs(const TargetMachine &TM, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. if (HasStackObjects) - Info->setHasNonSpillStackObjects(true); + Info.setHasNonSpillStackObjects(true); // Everything live out of a block is spilled with fast regalloc, so it's // almost certain that spilling will be required. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + if (TM.getOptLevel() == CodeGenOpt::None) HasStackObjects = true; + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); if (ST.isAmdCodeObjectV2(MF)) { if (HasStackObjects) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); - Info->setScratchRSrcReg(PrivateSegmentBufferReg); + Info.setScratchRSrcReg(PrivateSegmentBufferReg); - unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); } else { unsigned ReservedBufferReg - = TRI->reservedPrivateSegmentBufferReg(MF); + = TRI.reservedPrivateSegmentBufferReg(MF); unsigned ReservedOffsetReg - = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); // We tentatively reserve the last registers (skipping the last two // which may contain VCC). After register allocation, we'll replace // these with the ones immediately after those which were really // allocated. In the prologue copies will be inserted from the argument // to these reserved registers. - Info->setScratchRSrcReg(ReservedBufferReg); - Info->setScratchWaveOffsetReg(ReservedOffsetReg); + Info.setScratchRSrcReg(ReservedBufferReg); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } else { - unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); // Without HSA, relocations are used for the scratch pointer and the // buffer resource setup is always inserted in the prologue. Scratch wave // offset is still in an input SGPR. 
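reservePrivateMemoryRegs (started above and continued below) reduces to a small decision tree over the ABI and whether stack objects exist. A condensed sketch; the register arguments are placeholders for the preloaded and tentatively reserved registers named in the surrounding code:

struct ScratchRegs { unsigned RsrcReg, WaveOffsetReg; };

// Illustration only.
ScratchRegs pickScratchRegs(bool IsCodeObjectV2, bool HasStackObjects,
                            unsigned PreloadedRsrc, unsigned PreloadedOffset,
                            unsigned ReservedRsrc, unsigned ReservedOffset) {
  if (IsCodeObjectV2) {
    if (HasStackObjects)                    // use the real user SGPR inputs
      return {PreloadedRsrc, PreloadedOffset};
    return {ReservedRsrc, ReservedOffset};  // tentative, fixed up after RA
  }
  // Without HSA the resource descriptor is built in the prologue, so only
  // the wave offset may still come in as an input.
  return {ReservedRsrc, HasStackObjects ? PreloadedOffset : ReservedOffset};
}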
- Info->setScratchRSrcReg(ReservedBufferReg); + Info.setScratchRSrcReg(ReservedBufferReg); if (HasStackObjects) { - unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { unsigned ReservedOffsetReg - = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - Info->setScratchWaveOffsetReg(ReservedOffsetReg); + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } +} - if (Info->hasWorkItemIDX()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); + DAG.getContext()->diagnose(NoGraphicsHSA); + return DAG.getEntryNode(); } - if (Info->hasWorkItemIDY()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); + + SmallVector<ISD::InputArg, 16> Splits; + SmallVector<CCValAssign, 16> ArgLocs; + BitVector Skipped(Ins.size()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + bool IsShader = AMDGPU::isShader(CallConv); + bool IsKernel = AMDGPU::isKernel(CallConv); + bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); + + if (IsShader) { + processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); + + // At least one interpolation mode must be enabled or else the GPU will + // hang. + // + // Check PSInputAddr instead of PSInputEnable. The idea is that if the user + // set PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. 
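The condition right after this comment encodes those two rules over the PSInputAddr bit layout (bits 0-3 PERSP_*, bits 4-6 LINEAR_*, input 11 POS_W_FLOAT); when it holds, the code enables input 0 and claims VGPR0/VGPR1 so the wave always reads at least one interpolant. A stand-alone restatement (illustration only):

#include <cstdint>

bool psInputAddrNeedsFixup(uint32_t PSInputAddr, bool PosWFloatAllocated) {
  bool NoPerspOrLinear = (PSInputAddr & 0x7F) == 0;
  bool PosWWithoutPersp = (PSInputAddr & 0xF) == 0 && PosWFloatAllocated;
  return NoPerspOrLinear || PosWWithoutPersp;
}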
+ if (CallConv == CallingConv::AMDGPU_PS && + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } + + assert(!Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); + } else { + assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())); } - if (Info->hasWorkItemIDZ()) { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); - CCInfo.AllocateReg(Reg); + if (IsEntryFunc) { + allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + } + + if (IsKernel) { + analyzeFormalArgumentsCompute(CCInfo, Ins); + } else { + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); + CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + } + + SmallVector<SDValue, 16> Chains; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + if (Skipped[i]) { + InVals.push_back(DAG.getUNDEF(Arg.VT)); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + MVT VT = VA.getLocVT(); + + if (IsEntryFunc && VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = VA.getLocVT(); + + const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + + VA.getLocMemOffset(); + Info->setABIArgOffset(Offset + MemVT.getStoreSize()); + + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + SDValue Arg = lowerKernargMemParameter( + DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); + Chains.push_back(Arg.getValue(1)); + + auto *ParamTy = + dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. 
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + + InVals.push_back(Arg); + continue; + } + + if (VA.isMemLoc()) + report_fatal_error("memloc not supported with calling convention"); + + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + if (Arg.VT.isVector()) { + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector<SDValue, 4> Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + + SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); + Regs.push_back(Copy); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + Regs.append(NumElements, DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); + continue; + } + + InVals.push_back(Val); } - if (Chains.empty()) - return Chain; + // Start adding system SGPRs. + if (IsEntryFunc) + allocateSystemSGPRs(CCInfo, MF, *Info, IsShader); + + reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + return Chains.empty() ? Chain : + DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } SDValue @@ -1197,7 +1398,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (Flag.getNode()) RetOps.push_back(Flag); - unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; + unsigned Opc = Info->returnsVoid() ? 
AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -1470,16 +1671,16 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; if (Offset == 0) { MachineInstr *SetOn = - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addOperand(*Idx) - .addImm(IdxMode); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .add(*Idx) + .addImm(IdxMode); SetOn->getOperand(3).setIsUndef(); } else { unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) - .addOperand(*Idx) - .addImm(Offset); + .add(*Idx) + .addImm(Offset); MachineInstr *SetOn = BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) .addReg(Tmp, RegState::Kill) @@ -1493,10 +1694,10 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, if (Offset == 0) { BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(*Idx); + .add(*Idx); } else { BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addOperand(*Idx) + .add(*Idx) .addImm(Offset); } @@ -1522,7 +1723,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); - bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { MachineBasicBlock::iterator I(&MI); @@ -1548,7 +1749,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return &MBB; } - const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); @@ -1625,7 +1825,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); - bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; + bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (Idx->getReg() == AMDGPU::NoRegister) { MachineBasicBlock::iterator I(&MI); @@ -1634,9 +1834,9 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, assert(Offset == 0); BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) - .addOperand(*SrcVec) - .addOperand(*Val) - .addImm(SubReg); + .add(*SrcVec) + .add(*Val) + .addImm(SubReg); MI.eraseFromParent(); return &MBB; @@ -1648,11 +1848,11 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, if (UseGPRIdxMode) { BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst - .addOperand(*Val) - .addReg(Dst, RegState::ImplicitDefine) - .addReg(SrcVec->getReg(), RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst + .add(*Val) + .addReg(Dst, RegState::ImplicitDefine) + .addReg(SrcVec->getReg(), RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { @@ -1661,7 +1861,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, BuildMI(MBB, I, DL, MovRelDesc) .addReg(Dst, RegState::Define) .addReg(SrcVec->getReg()) - .addOperand(*Val) + .add(*Val) .addImm(SubReg - AMDGPU::sub0); } @@ -1694,18 +1894,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, if (UseGPRIdxMode) { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(PhiReg, RegState::Undef, SubReg) // vdst - .addOperand(*Val) // 
src0 - .addReg(Dst, RegState::ImplicitDefine) - .addReg(PhiReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(PhiReg, RegState::Undef, SubReg) // vdst + .add(*Val) // src0 + .addReg(Dst, RegState::ImplicitDefine) + .addReg(PhiReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); } else { const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc) .addReg(Dst, RegState::Define) .addReg(PhiReg) - .addOperand(*Val) + .add(*Val) .addImm(SubReg - AMDGPU::sub0); } @@ -1741,18 +1941,62 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } switch (MI.getOpcode()) { - case AMDGPU::SI_INIT_M0: { + case AMDGPU::S_TRAP_PSEUDO: { + const DebugLoc &DL = MI.getDebugLoc(); + const int TrapType = MI.getOperand(0).getImm(); + + if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && + Subtarget->isTrapHandlerEnabled()) { + + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + if (!BB->isLiveIn(UserSGPR)) + BB->addLiveIn(UserSGPR); + + BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1) + .addReg(UserSGPR); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP)) + .addImm(TrapType) + .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit); + } else { + switch (TrapType) { + case SISubtarget::TrapIDLLVMTrap: + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM)); + break; + case SISubtarget::TrapIDLLVMDebugTrap: { + DiagnosticInfoUnsupported NoTrap(*MF->getFunction(), + "debugtrap handler not supported", + DL, + DS_Warning); + LLVMContext &C = MF->getFunction()->getContext(); + C.diagnose(NoTrap); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP)) + .addImm(0); + break; + } + default: + llvm_unreachable("unsupported trap handler type!"); + } + } + + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_INIT_M0: BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); MI.eraseFromParent(); return BB; - } + case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) - .addOperand(MI.getOperand(0)) - .addImm(MFI->getLDSSize()); + .add(MI.getOperand(0)) + .addImm(MFI->getLDSSize()); MI.eraseFromParent(); return BB; } @@ -1803,7 +2047,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) - .addOperand(MI.getOperand(0)); + .add(MI.getOperand(0)); Br->getOperand(1).setIsUndef(true); // read undef SCC MI.eraseFromParent(); return BB; @@ -1856,9 +2100,6 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); - if (!VT.isSimple()) - return false; - switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: // This is as fast on some subtargets. 
However, we always have full rate f32 @@ -1909,13 +2150,52 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); - case ISD::TRAP: return lowerTRAP(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return lowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); } return SDValue(); } +void SITargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::INSERT_VECTOR_ELT: { + if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) + Results.push_back(Res); + return; + } + case ISD::EXTRACT_VECTOR_ELT: { + if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) + Results.push_back(Res); + return; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, + Src0, Src1); + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); + return; + } + default: + break; + } + } + default: + break; + } +} + /// \brief Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { @@ -1932,31 +2212,25 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { return nullptr; } -bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { +unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { - case AMDGPUIntrinsic::amdgcn_if: - case AMDGPUIntrinsic::amdgcn_else: - case AMDGPUIntrinsic::amdgcn_end_cf: - case AMDGPUIntrinsic::amdgcn_loop: - return true; + case Intrinsic::amdgcn_if: + return AMDGPUISD::IF; + case Intrinsic::amdgcn_else: + return AMDGPUISD::ELSE; + case Intrinsic::amdgcn_loop: + return AMDGPUISD::LOOP; + case Intrinsic::amdgcn_end_cf: + llvm_unreachable("should not occur"); default: - return false; + return 0; } } - if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::amdgcn_break: - case AMDGPUIntrinsic::amdgcn_if_break: - case AMDGPUIntrinsic::amdgcn_else_break: - return true; - default: - return false; - } - } - - return false; + // break, if_break, else_break are all only used as inputs to loop, not + // directly as branch conditions. 
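The reworked isCFIntrinsic above now returns the structured-control-flow node opcode to build (with 0 meaning "not a control-flow intrinsic"), which LowerBRCOND then feeds straight into getNode. A shape-only sketch with placeholder intrinsic IDs:

// Illustration only.
enum CFNodeKind { NotCF = 0, IfNode, ElseNode, LoopNode };

CFNodeKind classifyCFIntrinsic(unsigned IntrID, unsigned IfID, unsigned ElseID,
                               unsigned LoopID) {
  if (IntrID == IfID)   return IfNode;
  if (IntrID == ElseID) return ElseNode;
  if (IntrID == LoopID) return LoopNode;
  return NotCF;  // break/if_break/else_break only feed loop, never a branch
}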
+ return 0; } void SITargetLowering::createDebuggerPrologueStackObjects( @@ -1987,13 +2261,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -2006,7 +2280,6 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { - SDLoc DL(BRCOND); SDNode *Intr = BRCOND.getOperand(1).getNode(); @@ -2032,7 +2305,8 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> - if (!isCFIntrinsic(Intr)) { + unsigned CFNode = isCFIntrinsic(Intr); + if (CFNode == 0) { // This is a uniform branch so we don't need to legalize. return BRCOND; } @@ -2050,15 +2324,13 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, if (HaveChain) Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end()); + Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end()); Ops.push_back(Target); ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); // build the new intrinsic call - SDNode *Result = DAG.getNode( - Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res), Ops).getNode(); + SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); if (!HaveChain) { SDValue Ops[] = { @@ -2130,9 +2402,28 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; } -SDValue SITargetLowering::getSegmentAperture(unsigned AS, +SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { - SDLoc SL; + // FIXME: Use inline constants (src_{shared, private}_base) instead. + if (Subtarget->hasApertureRegs()) { + unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? + AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : + AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? 
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : + AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = + AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + + SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); + SDValue ApertureReg = SDValue( + DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); + SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); + return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); unsigned UserSGPR = Info->getQueuePtrUserSGPR(); @@ -2143,19 +2434,19 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. - uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, - DAG.getConstant(StructOffset, SL, MVT::i64)); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, DL, MVT::i64)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), - AMDGPUAS::CONSTANT_ADDRESS)); + AMDGPUASI.CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); - return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, MinAlign(64, StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -2167,15 +2458,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); SDValue Src = ASC->getOperand(0); - - // FIXME: Really support non-0 null pointers. 
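A standalone sketch (not part of the patch) of the pointer arithmetic the aperture code above implements; the register width handling mirrors the WidthM1 + 1 shift in getSegmentAperture, and the concrete values are purely illustrative.

#include <cstdint>

// The s_getreg path reads an aperture field of WidthM1 + 1 bits; shifting it
// back up reconstructs the high 32-bit word of the segment base.
uint32_t apertureFromGetreg(uint32_t getregResult, unsigned widthM1) {
  return getregResult << (widthM1 + 1);
}

// local/private -> flat: lowerADDRSPACECAST below pairs the 32-bit segment
// offset with the aperture word (the BUILD_VECTOR of {src, aperture}), so the
// aperture becomes the high half of the 64-bit flat address.
uint64_t segmentToFlat(uint32_t segmentPtr, uint32_t apertureHi) {
  return (uint64_t(apertureHi) << 32) | segmentPtr;
}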
- SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + const AMDGPUTargetMachine &TM = + static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); + // flat -> local/private - if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + unsigned DestAS = ASC->getDestAddressSpace(); + + if (DestAS == AMDGPUASI.LOCAL_ADDRESS || + DestAS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned NullVal = TM.getNullPointerValue(DestAS); + SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); @@ -2185,13 +2480,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, } // local/private -> flat - if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + unsigned SrcAS = ASC->getSrcAddressSpace(); + + if (SrcAS == AMDGPUASI.LOCAL_ADDRESS || + SrcAS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned NullVal = TM.getNullPointerValue(SrcAS); + SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); SDValue CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); @@ -2211,17 +2511,88 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, return DAG.getUNDEF(ASC->getValueType(0)); } +SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Idx = Op.getOperand(2); + if (isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Avoid stack access for dynamic indexing. + SDLoc SL(Op); + SDValue Vec = Op.getOperand(0); + SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); + + // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec + SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); + + // Convert vector index to bit-index. 
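As a rough scalar picture of what the dynamic-index path is after (not the exact DAG node sequence, which goes through the v_bfm/v_bfi pattern named in the comment above), inserting or extracting a 16-bit lane of a v2i16/v2f16 value is just masked merging on the 32-bit register that holds the whole vector:

#include <cstdint>

uint32_t insertLaneV2I16(uint32_t vecBits, uint16_t val, unsigned idx) {
  unsigned bit = idx * 16;                  // vector index -> bit index (idx is 0 or 1)
  uint32_t mask = 0xffffu << bit;           // 16-bit field at the lane
  return (mask & (uint32_t(val) << bit)) |  // new lane
         (~mask & vecBits);                 // preserved lane
}

uint16_t extractLaneV2I16(uint32_t vecBits, unsigned idx) {
  return uint16_t(vecBits >> (idx * 16));   // shift lane down, truncate
}

This is also why the constant-index cases in lowerEXTRACT_VECTOR_ELT below reduce to a plain shift-right by 16 (index 1) or a bare truncate (index 0).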
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, + DAG.getConstant(16, SL, MVT::i32)); + + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + + SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, + DAG.getConstant(0xffff, SL, MVT::i32), + ScaledIdx); + + SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, + DAG.getNOT(SL, BFM, MVT::i32), BCVec); + + SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); +} + +SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + + EVT ResultVT = Op.getValueType(); + SDValue Vec = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + + if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { + SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + + if (CIdx->getZExtValue() == 1) { + Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result, + DAG.getConstant(16, SL, MVT::i32)); + } else { + assert(CIdx->getZExtValue() == 0); + } + + if (ResultVT.bitsLT(MVT::i32)) + Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + } + + SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32); + + // Convert vector index to bit-index. + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + + SDValue Result = Elt; + if (ResultVT.bitsLT(MVT::i32)) + Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + + return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); +} + bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. 
- return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && + return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitGOTReloc(GA->getGlobal()); } -static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, - SDLoc DL, unsigned Offset, EVT PtrVT, - unsigned GAFlags = SIInstrInfo::MO_NONE) { +static SDValue +buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + const SDLoc &DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: // @@ -2265,8 +2636,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); @@ -2283,7 +2654,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SIInstrInfo::MO_GOTPCREL32); Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); unsigned Align = DataLayout.getABITypeAlignment(PtrTy); // FIXME: Use a PseudoSourceValue once those can be assigned an address space. @@ -2294,23 +2665,6 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, MachineMemOperand::MOInvariant); } -SDValue SITargetLowering::lowerTRAP(SDValue Op, - SelectionDAG &DAG) const { - const MachineFunction &MF = DAG.getMachineFunction(); - DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), - "trap handler not supported", - Op.getDebugLoc(), - DS_Warning); - DAG.getContext()->diagnose(NoTrap); - - // Emit s_endpgm. - - // FIXME: This should really be selected to s_trap, but that requires - // setting up the trap handler for it o do anything. - return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, - Op.getOperand(0)); -} - SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const { // We can't use S_MOV_B32 directly, because there is no way to specify m0 as @@ -2332,14 +2686,15 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, MVT VT, unsigned Offset) const { SDLoc SL(Op); - SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); // The local size values will have the hi 16-bits as zero. 
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); } -static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { +static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, + EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), "non-hsa intrinsic with hsa target", DL.getDebugLoc()); @@ -2347,7 +2702,8 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { return DAG.getUNDEF(VT); } -static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { +static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, + EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), "intrinsic not supported on subtarget", DL.getDebugLoc()); @@ -2389,7 +2745,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_implicitarg_ptr: { unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); - return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset); } case Intrinsic::amdgcn_kernarg_segment_ptr: { unsigned Reg @@ -2403,19 +2759,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq: - case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - case Intrinsic::amdgcn_rsq_legacy: { + case Intrinsic::amdgcn_rsq_legacy: if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - } - case Intrinsic::amdgcn_rcp_legacy: { + case Intrinsic::amdgcn_rcp_legacy: if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); - } case Intrinsic::amdgcn_rsq_clamp: { if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); @@ -2434,38 +2787,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, 
DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -2522,43 +2875,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - } - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::SI_fs_constant: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(1), Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_packf16: - if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) - return DAG.getUNDEF(MVT::i32); - return Op; - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I); - J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); - } case Intrinsic::amdgcn_interp_mov: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); SDValue Glue = M0.getValue(1); @@ -2639,10 +2957,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_icmp: { const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - int CondCode = CD->getSExtValue(); + if (!CD) + return DAG.getUNDEF(VT); + int CondCode = CD->getSExtValue(); if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || - CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) + CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) return DAG.getUNDEF(VT); ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); @@ -2652,10 +2972,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fcmp: { const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - int CondCode = CD->getSExtValue(); + if (!CD) + return DAG.getUNDEF(VT); - if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || 
- CondCode >= FCmpInst::Predicate::FCMP_TRUE) + int CondCode = CD->getSExtValue(); + if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || + CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) return DAG.getUNDEF(VT); FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); @@ -2663,14 +2985,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), Op.getOperand(2), DAG.getCondCode(CCOpcode)); } + case Intrinsic::amdgcn_fmed3: + return DAG.getNode(AMDGPUISD::FMED3, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_sffbh: - case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_sbfe: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_ubfe: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_cvt_pkrtz: { + // FIXME: Stop adding cast if v2f16 legal. + EVT VT = Op.getValueType(); + SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::BITCAST, DL, VT, Node); + } default: - return AMDGPUTargetLowering::LowerOperation(Op, DAG); + return Op; } } @@ -2718,6 +3055,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + // Basic sample. + case Intrinsic::amdgcn_image_sample: + case Intrinsic::amdgcn_image_sample_cl: + case Intrinsic::amdgcn_image_sample_d: + case Intrinsic::amdgcn_image_sample_d_cl: + case Intrinsic::amdgcn_image_sample_l: + case Intrinsic::amdgcn_image_sample_b: + case Intrinsic::amdgcn_image_sample_b_cl: + case Intrinsic::amdgcn_image_sample_lz: + case Intrinsic::amdgcn_image_sample_cd: + case Intrinsic::amdgcn_image_sample_cd_cl: + + // Sample with comparison. + case Intrinsic::amdgcn_image_sample_c: + case Intrinsic::amdgcn_image_sample_c_cl: + case Intrinsic::amdgcn_image_sample_c_d: + case Intrinsic::amdgcn_image_sample_c_d_cl: + case Intrinsic::amdgcn_image_sample_c_l: + case Intrinsic::amdgcn_image_sample_c_b: + case Intrinsic::amdgcn_image_sample_c_b_cl: + case Intrinsic::amdgcn_image_sample_c_lz: + case Intrinsic::amdgcn_image_sample_c_cd: + case Intrinsic::amdgcn_image_sample_c_cd_cl: + + // Sample with offsets. + case Intrinsic::amdgcn_image_sample_o: + case Intrinsic::amdgcn_image_sample_cl_o: + case Intrinsic::amdgcn_image_sample_d_o: + case Intrinsic::amdgcn_image_sample_d_cl_o: + case Intrinsic::amdgcn_image_sample_l_o: + case Intrinsic::amdgcn_image_sample_b_o: + case Intrinsic::amdgcn_image_sample_b_cl_o: + case Intrinsic::amdgcn_image_sample_lz_o: + case Intrinsic::amdgcn_image_sample_cd_o: + case Intrinsic::amdgcn_image_sample_cd_cl_o: + + // Sample with comparison and offsets. 
+ case Intrinsic::amdgcn_image_sample_c_o: + case Intrinsic::amdgcn_image_sample_c_cl_o: + case Intrinsic::amdgcn_image_sample_c_d_o: + case Intrinsic::amdgcn_image_sample_c_d_cl_o: + case Intrinsic::amdgcn_image_sample_c_l_o: + case Intrinsic::amdgcn_image_sample_c_b_o: + case Intrinsic::amdgcn_image_sample_c_b_cl_o: + case Intrinsic::amdgcn_image_sample_c_lz_o: + case Intrinsic::amdgcn_image_sample_c_cd_o: + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + case Intrinsic::amdgcn_image_getlod: { + // Replace dmask with everything disabled with undef. + const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); + if (!DMask || DMask->isNullValue()) { + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); + } + + return SDValue(); + } default: return SDValue(); } @@ -2731,17 +3126,60 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: - case Intrinsic::amdgcn_s_sendmsg: { - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); + case Intrinsic::amdgcn_exp: { + const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); + const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8)); + const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9)); + + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + Op.getOperand(4), // src0 + Op.getOperand(5), // src1 + Op.getOperand(6), // src2 + Op.getOperand(7), // src3 + DAG.getTargetConstant(0, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + } + case Intrinsic::amdgcn_exp_compr: { + const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); + const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); + SDValue Src0 = Op.getOperand(4); + SDValue Src1 = Op.getOperand(5); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); + const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7)); + + SDValue Undef = DAG.getUNDEF(MVT::f32); + const SDValue Ops[] = { + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), + Undef, // src2 + Undef, // src3 + DAG.getTargetConstant(1, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + }; + + unsigned Opc = Done->isNullValue() ? + AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } + case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { + unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? 
+ AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain, + return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } case AMDGPUIntrinsic::SI_tbuffer_store: { @@ -2784,31 +3222,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); } - case AMDGPUIntrinsic::SI_export: { - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3)); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4)); - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5)); - const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6)); - - const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), - DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), - Op.getOperand(7), // src0 - Op.getOperand(8), // src1 - Op.getOperand(9), // src2 - Op.getOperand(10) // src3 - }; - - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); - } - default: + case Intrinsic::amdgcn_s_barrier: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { + const MachineFunction &MF = DAG.getMachineFunction(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; + if (WGSize <= ST.getWavefrontSize()) + return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, + Op.getOperand(0)), 0); + } return SDValue(); + }; + default: + return Op; } } @@ -2857,21 +3283,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUASI.FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - switch (AS) { - case AMDGPUAS::CONSTANT_ADDRESS: + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { if (isMemOpUniform(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // - LLVM_FALLTHROUGH; - case AMDGPUAS::GLOBAL_ADDRESS: { + } + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); @@ -2880,13 +3305,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // loads. // } - LLVM_FALLTHROUGH; - case AMDGPUAS::FLAT_ADDRESS: + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); // v4 loads are supported for private and global memory. 
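Tying back to the amdgcn.s_barrier handling just above: the elision is legal because a work-group that can never span more than one wave already executes in lock-step, so a hardware barrier has no observable effect. A minimal sketch of the condition, with the subtarget queries replaced by plain parameters:

// Illustrative only; the real code asks SISubtarget for both values.
bool canDropHardwareBarrier(unsigned maxFlatWorkGroupSize,
                            unsigned wavefrontSize /* 64 on GCN */) {
  return maxFlatWorkGroupSize <= wavefrontSize;
}

WAVE_BARRIER is still emitted so that code motion across the barrier point remains blocked even though no instruction is produced for it.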
return SDValue(); - case AMDGPUAS::PRIVATE_ADDRESS: { + } + if (AS == AMDGPUASI.PRIVATE_ADDRESS) { // Depending on the setting of the private_element_size field in the // resource descriptor, we can only make private accesses up to a certain // size. @@ -2905,8 +3331,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } - case AMDGPUAS::LOCAL_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { if (NumElements > 2) return SplitVectorLoad(Op, DAG); @@ -2916,9 +3341,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // If properly aligned, if we split we might be able to use ds_read_b64. return SplitVectorLoad(Op, DAG); } - default: - return SDValue(); - } + return SDValue(); } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -3287,18 +3710,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUASI.FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; + AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = VT.getVectorNumElements(); - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::FLAT_ADDRESS: + if (AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); return SDValue(); - case AMDGPUAS::PRIVATE_ADDRESS: { + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { case 4: return scalarizeVectorStore(Store, DAG); @@ -3313,8 +3735,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } - case AMDGPUAS::LOCAL_ADDRESS: { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { if (NumElements > 2) return SplitVectorStore(Op, DAG); @@ -3323,8 +3744,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // If properly aligned, if we split we might be able to use ds_write_b64. return SplitVectorStore(Op, DAG); - } - default: + } else { llvm_unreachable("unhandled address space"); } } @@ -3355,7 +3775,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co unsigned AS = AtomicNode->getAddressSpace(); // No custom lowering required for local address space - if (!isFlatGlobalAddrSpace(AS)) + if (!isFlatGlobalAddrSpace(AS, AMDGPUASI)) return Op; // Non-local address space requires custom lowering for atomic compare @@ -3412,12 +3832,12 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, const SISubtarget &STI) { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { + auto AMDGPUASI = STI.getAMDGPUAS(); + if (AS == AMDGPUASI.GLOBAL_ADDRESS) { // MUBUF instructions a 12-bit offset in bytes. return isUInt<12>(OffsetSize); } - case AMDGPUAS::CONSTANT_ADDRESS: { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. 
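Worked numbers for the buffer/SMRD limits described in the surrounding comments; a hypothetical helper that mirrors the checks rather than the pass's own code:

#include <cstdint>

bool smrdOffsetIsFoldable(uint64_t bytes, bool isVIOrLater) {
  if (isVIOrLater)
    return bytes < (1u << 20);                    // 20-bit byte offset on VI
  return bytes % 4 == 0 && bytes / 4 < (1u << 8); // 8-bit dword offset on SI
}

// Examples: 1020 folds on SI (255 dwords) and on VI; 1024 folds only on VI,
// since 256 dwords does not fit in 8 bits. MUBUF offsets are capped at 12
// bits, and DS offsets at 16 bits, as the other branches state.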
if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) @@ -3425,16 +3845,13 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { + if (AS == AMDGPUASI.LOCAL_ADDRESS || + AS == AMDGPUASI.REGION_ADDRESS) { // The single offset versions have a 16-bit offset in bytes. return isUInt<16>(OffsetSize); } - case AMDGPUAS::PRIVATE_ADDRESS: // Indirect register addressing does not use any offsets. - default: - return 0; - } + return false; } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) @@ -3492,7 +3909,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, // TODO: We could also do this for multiplies. unsigned AS = N->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); if (NewPtr) { SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); @@ -3692,6 +4109,88 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, return SDValue(); } +// Instructions that will be lowered with a final instruction that zeros the +// high result bits. +// XXX - probably only need to list legal operations. +static bool fp16SrcZerosHighBits(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FMA: + case ISD::FMAD: + case ISD::FCANONICALIZE: + case ISD::FP_ROUND: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. + case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::LDEXP: + return true; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. + return false; + } +} + +SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!Subtarget->has16BitInsts() || + DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + SDValue Src = N->getOperand(0); + if (Src.getValueType() != MVT::i16) + return SDValue(); + + // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src + // FIXME: It is not universally true that the high bits are zeroed on gfx9. 
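In scalar terms, the combine below relies on the listed f16 producers writing a full 32-bit VGPR with bits 31:16 already cleared, so widening the raw half bits costs nothing; a trivial illustration:

#include <cstdint>

// f16ResultBits stands for the low 16 bits written by one of the ops in
// fp16SrcZerosHighBits(); no masking instruction is needed to widen them.
uint32_t widenF16Bits(uint16_t f16ResultBits) {
  return f16ResultBits; // (i32 zext (i16 bitcast f16:$src)) -> FP16_ZEXT $src
}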
+ if (Src.getOpcode() == ISD::BITCAST) { + SDValue BCSrc = Src.getOperand(0); + if (BCSrc.getValueType() == MVT::f16 && + fp16SrcZerosHighBits(BCSrc.getOpcode())) + return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc); + } + + return SDValue(); +} + SDValue SITargetLowering::performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -3713,7 +4212,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0)); if (!CFP) return SDValue(); @@ -3723,13 +4222,14 @@ SDValue SITargetLowering::performFCanonicalizeCombine( // Flush denormals to 0 if not enabled. if (C.isDenormal()) { EVT VT = N->getValueType(0); - if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + EVT SVT = VT.getScalarType(); + if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); - if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); - if (VT == MVT::f16 && !Subtarget->hasFP16Denormals()) + if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals()) return DAG.getConstantFP(0.0, SDLoc(N), VT); } @@ -3749,7 +4249,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); } - return SDValue(CFP, 0); + return N->getOperand(0); } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { @@ -3771,8 +4271,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) { +SDValue SITargetLowering::performIntMed3ImmCombine( + SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const { ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); if (!K1) return SDValue(); @@ -3790,23 +4291,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); + unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; + if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { + return DAG.getNode(Med3Opc, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + } + // If there isn't a 16-bit med3 operation, convert to 32-bit. MVT NVT = MVT::i32; unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Tmp1, Tmp2, Tmp3; - Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - - if (VT == MVT::i16) { - Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, - Tmp1, Tmp2, Tmp3); + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); - } else - return DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { @@ -3816,8 +4316,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1) { +SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Op0, + SDValue Op1) const { ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); if (!K1) return SDValue(); @@ -3831,6 +4333,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + // TODO: Check IEEE bit enabled? + EVT VT = K0->getValueType(0); + if (Subtarget->enableDX10Clamp()) { + // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the + // hardware fmed3 behavior converting to a min. + // FIXME: Should this be allowing -0.0? + if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + } + + // med3 for f16 is only available on gfx9+. + if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16())) + return SDValue(); + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then // give the other result, which is different from med3 with a NaN input. @@ -3846,6 +4362,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -3853,7 +4370,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && + VT != MVT::f64) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { @@ -3895,7 +4414,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && - N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + (VT == MVT::f32 || VT == MVT::f64 || + (VT == MVT::f16 && Subtarget->has16BitInsts())) && + Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; } @@ -3903,6 +4424,69 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return SDValue(); } +static bool isClampZeroToOne(SDValue A, SDValue B) { + if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { + if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { + // FIXME: Should this be allowing -0.0? + return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || + (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); + } + } + + return false; +} + +// FIXME: Should only worry about snans for version with chain. 
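The combines around here rest on a simple identity: for constants K0 <= K1, clamping with min/max picks the median of the three values, which v_med3 computes in one instruction, and the (0.0, 1.0) special case is exactly the hardware clamp. A scalar statement of the identity (NaN behaviour is what the surrounding isKnownNeverSNan and dx10-clamp checks are guarding):

#include <algorithm>

float clampLike(float x, float k0, float k1) {   // assumes k0 <= k1
  return std::min(std::max(x, k0), k1);          // == med3(x, k0, k1)
}

float clamp01(float x) {
  return clampLike(x, 0.0f, 1.0f);               // == AMDGPUISD::CLAMP
}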
+SDValue SITargetLowering::performFMed3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and + // NaNs. With a NaN input, the order of the operands may change the result. + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + + if (isClampZeroToOne(Src0, Src1)) { + // const_a, const_b, x -> clamp is safe in all cases including signaling + // nans. + // FIXME: Should this be allowing -0.0? + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + } + + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother + // handling no dx10-clamp? + if (Subtarget->enableDX10Clamp()) { + // If NaNs is clamped to 0, we are free to reorder the inputs. + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) + std::swap(Src1, Src2); + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isClampZeroToOne(Src1, Src2)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); + } + + return SDValue(); +} + +SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + if (Src0.isUndef() && Src1.isUndef()) + return DCI.DAG.getUNDEF(N->getValueType(0)); + return SDValue(); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -3933,7 +4517,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - assert(!VT.isVector()); SDLoc SL(N); SDValue LHS = N->getOperand(0); @@ -4112,7 +4695,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) return performMinMaxCombine(N, DCI); break; @@ -4135,17 +4717,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); - } case ISD::AND: return performAndCombine(N, DCI); case ISD::OR: return performOrCombine(N, DCI); case ISD::XOR: return performXorCombine(N, DCI); + case ISD::ZERO_EXTEND: + return performZeroExtendCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); case ISD::FCANONICALIZE: @@ -4170,6 +4753,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: return performCvtF32UByteNCombine(N, DCI); + case AMDGPUISD::FMED3: + return performFMed3Combine(N, DCI); + case AMDGPUISD::CVT_PKRTZ_F16_F32: + return performCvtPkRTZCombine(N, DCI); + case ISD::SCALAR_TO_VECTOR: { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + SDLoc SL(N); + SDValue Src = N->getOperand(0); + EVT EltVT = Src.getValueType(); + if (EltVT == MVT::f16) + Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); + + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); + return DAG.getNode(ISD::BITCAST, SL, VT, Ext); + } + + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -4198,6 +4803,10 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E; ++I) { + // Don't look at users of the chain. + if (I.getUse().getResNo() != 0) + continue; + // Abort if we can't understand the usage if (!I->isMachineOpcode() || I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) @@ -4250,7 +4859,6 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Update the users of the node with the new indices for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { - SDNode *User = Users[i]; if (!User) continue; @@ -4277,8 +4885,33 @@ static bool isFrameIndexOp(SDValue Op) { /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. -void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { +SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + if (Node->getOpcode() == ISD::CopyToReg) { + RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1)); + SDValue SrcVal = Node->getOperand(2); + + // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have + // to try understanding copies to physical registers. + if (SrcVal.getValueType() == MVT::i1 && + TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) { + SDLoc SL(Node); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + SDValue VReg = DAG.getRegister( + MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); + + SDNode *Glued = Node->getGluedNode(); + SDValue ToVReg + = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, + SDValue(Glued, Glued ? 
Glued->getNumValues() - 1 : 0)); + SDValue ToResultReg + = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), + VReg, ToVReg.getValue(1)); + DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode()); + DAG.RemoveDeadNode(Node); + return ToResultReg.getNode(); + } + } SmallVector<SDValue, 8> Ops; for (unsigned i = 0; i < Node->getNumOperands(); ++i) { @@ -4294,6 +4927,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, } DAG.UpdateNodeOperands(Node, Ops); + return Node; } /// \brief Fold the instructions after selecting them. @@ -4496,6 +5130,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &AMDGPU::SReg_128RegClass); case 256: return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::SReg_512RegClass); } case 'v': diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 6c04e4f30977..d177777ad5ee 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -21,11 +21,13 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { - SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - unsigned Offset) const; - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, - SDValue Chain, unsigned Offset, bool Signed, - const ISD::InputArg *Arg = nullptr) const; + SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, + SDValue Chain, uint64_t Offset) const; + SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + uint64_t Offset, bool Signed, + const ISD::InputArg *Arg = nullptr) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -55,11 +57,19 @@ class SITargetLowering final : public AMDGPUTargetLowering { const SDLoc &DL, EVT VT) const; + SDValue convertArgType( + SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, + bool Signed, const ISD::InputArg *Arg = nullptr) const; + /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, + SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; @@ -79,10 +89,17 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) const; + SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; @@ -94,7 +111,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; - bool isCFIntrinsic(const SDNode *Intr) const; + unsigned isCFIntrinsic(const SDNode *Intr) const; void createDebuggerPrologueStackObjects(MachineFunction &MF) const; @@ -115,11 +132,15 @@ public: const SISubtarget *getSubtarget() const; + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, unsigned IntrinsicID) const override; - bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, - EVT /*VT*/) const override; + bool getAddrModeArguments(IntrinsicInst * /*I*/, + SmallVectorImpl<Value*> &/*Ops*/, + Type *&/*AccessTy*/) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; @@ -175,6 +196,9 @@ public: MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; void AdjustInstrPostInstrSelection(MachineInstr &MI, @@ -182,7 +206,7 @@ public: SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const override; - void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const; diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index 
91e4bf755c53..ba346d2fad02 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -1,4 +1,4 @@ -//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===// +//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// // // The LLVM Compiler Infrastructure // @@ -12,33 +12,46 @@ /// branches when it's expected that jumping over the untaken control flow will /// be cheaper than having every workitem no-op through it. // +//===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; #define DEBUG_TYPE "si-insert-skips" -namespace { - static cl::opt<unsigned> SkipThresholdFlag( "amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden); +namespace { + class SIInsertSkips : public MachineFunctionPass { private: - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - unsigned SkipThreshold; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + unsigned SkipThreshold = 0; bool shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const; @@ -55,8 +68,7 @@ private: public: static char ID; - SIInsertSkips() : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { } + SIInsertSkips() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -69,7 +81,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char SIInsertSkips::ID = 0; @@ -195,8 +207,8 @@ void SIInsertSkips::kill(MachineInstr &MI) { } } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) - .addImm(0) - .addOperand(Op); + .addImm(0) + .add(Op); } } @@ -251,6 +263,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; + bool HaveSkipBlock = false; if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { // Reached convergence point for last divergent branch. @@ -270,27 +283,33 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; switch (MI.getOpcode()) { - case AMDGPU::SI_MASK_BRANCH: { + case AMDGPU::SI_MASK_BRANCH: ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); break; - } - case AMDGPU::S_BRANCH: { + + case AMDGPU::S_BRANCH: // Optimize out branches to the next block. // FIXME: Shouldn't this be handled by BranchFolding? 
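The skip decisions in this pass are driven by the amdgpu-skip-threshold option declared above (default 12): an s_cbranch_execz is only worth inserting when the divergent region it would jump over is long enough that letting inactive lanes no-op through it is the bigger cost. Roughly, and ignoring the early-outs in the real shouldSkip():

// Hypothetical distillation of the threshold test, not the pass's code.
bool worthSkipping(unsigned instrsInRegion, unsigned skipThreshold = 12) {
  return instrsInRegion >= skipThreshold;
}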
- if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) + if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { + MI.eraseFromParent(); + } else if (HaveSkipBlock) { + // Remove the given unconditional branch when a skip block has been + // inserted after the current one and let skip the two instructions + // performing the kill if the exec mask is non-zero. MI.eraseFromParent(); + } break; - } - case AMDGPU::SI_KILL_TERMINATOR: { + + case AMDGPU::SI_KILL_TERMINATOR: MadeChange = true; kill(MI); if (ExecBranchStack.empty()) { if (skipIfDead(MI, *NextBB)) { + HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); - Next = MBB.end(); } } else { HaveKill = true; @@ -298,15 +317,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); break; - } - case AMDGPU::SI_RETURN: { + + case AMDGPU::SI_RETURN_TO_EPILOG: // FIXME: Should move somewhere else assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); // Graphics shaders returning non-void shouldn't contain S_ENDPGM, // because external bytecode will be appended at the end. if (BI != --MF.end() || I != MBB.getFirstTerminator()) { - // SI_RETURN is not the last instruction. Add an empty block at + // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at // the end and jump there. if (!EmptyMBBAtEnd) { EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); @@ -318,7 +337,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { .addMBB(EmptyMBBAtEnd); I->eraseFromParent(); } - } + break; + default: break; } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp new file mode 100644 index 000000000000..c2a3e62aa827 --- /dev/null +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -0,0 +1,1863 @@ +//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define DEBUG_TYPE "si-insert-waitcnts" + +using namespace llvm; + +namespace { + +// Class of object that encapsulates latest instruction counter score +// associated with the operand. Used for determining whether +// s_waitcnt instruction needs to be emited. 
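// --- Aside: a toy standalone model (not from the patch) of the scoring
// scheme the file comment describes. Each memory operation receives a
// monotonically increasing "score" per counter class when it is issued, and
// the score of the last operation touching a register is remembered.
// Operations of one class complete in order, so waiting until at most
// (upper bound - score) of them are still outstanding makes the register's
// value available; that difference is what ends up in the s_waitcnt operand.
#include <algorithm>
#include <cstdint>

namespace waitcnt_model {

struct Counter {
  int32_t UB = 0;             // score of the most recently issued op
  int32_t RegScore[256] = {}; // last score that touched each register
};

// Issue a new memory op whose result lands in register Reg.
void issue(Counter &C, unsigned Reg) { C.RegScore[Reg] = ++C.UB; }

// How many ops of this class may still be outstanding when Reg is read:
// 0 means "drain everything"; a large value means no wait is needed.
int32_t requiredWait(const Counter &C, unsigned Reg) {
  return std::max<int32_t>(C.UB - C.RegScore[Reg], 0);
}

} // namespace waitcnt_model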
+ +#define CNT_MASK(t) (1u << (t)) + +enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; + +typedef std::pair<signed, signed> RegInterval; + +struct { + int32_t VmcntMax; + int32_t ExpcntMax; + int32_t LgkmcntMax; + int32_t NumVGPRsMax; + int32_t NumSGPRsMax; +} HardwareLimits; + +struct { + unsigned VGPR0; + unsigned VGPRL; + unsigned SGPR0; + unsigned SGPRL; +} RegisterEncoding; + +enum WaitEventType { + VMEM_ACCESS, // vector-memory read & write + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + NUM_WAIT_EVENTS, +}; + +// The mapping is: +// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs +// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots +// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs +// We reserve a fixed number of VGPR slots in the scoring tables for +// special tokens like SCMEM_LDS (needed for buffer load to LDS). +enum RegisterMapping { + SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets. + SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. + NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. + EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. + NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. +}; + +#define ForAllWaitEventType(w) \ + for (enum WaitEventType w = (enum WaitEventType)0; \ + (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \ + (w) = (enum WaitEventType)((w) + 1)) + +// This is a per-basic-block object that maintains current score brackets +// of each wait-counter, and a per-register scoreboard for each wait-couner. +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait-count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class BlockWaitcntBrackets { +public: + static int32_t getWaitCountMax(InstCounterType T) { + switch (T) { + case VM_CNT: + return HardwareLimits.VmcntMax; + case LGKM_CNT: + return HardwareLimits.LgkmcntMax; + case EXP_CNT: + return HardwareLimits.ExpcntMax; + default: + break; + } + return 0; + }; + + void setScoreLB(InstCounterType T, int32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreLBs[T] = Val; + }; + + void setScoreUB(InstCounterType T, int32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreUBs[T] = Val; + if (T == EXP_CNT) { + int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT)); + if (ScoreLBs[T] < UB) + ScoreLBs[T] = UB; + } + }; + + int32_t getScoreLB(InstCounterType T) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return 0; + return ScoreLBs[T]; + }; + + int32_t getScoreUB(InstCounterType T) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return 0; + return ScoreUBs[T]; + }; + + // Mapping from event to counter. 
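// --- Aside: a standalone restatement (not from the patch) of the
// RegisterMapping scheme above, which flattens three kinds of tracked
// objects into one scoring table: VGPR encodings occupy [0, SQ_MAX_PGM_VGPRS),
// one extra slot (EXTRA_VGPR_LDS) stands in for LDS as a whole, and SGPR
// encodings follow at an offset of NUM_ALL_VGPRS.
namespace reg_index_model {

constexpr int MaxVGPRs = 256;                    // SQ_MAX_PGM_VGPRS
constexpr int ExtraVGPRs = 1;                    // reserved LDS slot
constexpr int AllVGPRs = MaxVGPRs + ExtraVGPRs;  // where SGPRs start

// Encoding is the hardware register encoding relative to VGPR0 / SGPR0.
int scoreIndexForVGPR(int Encoding) { return Encoding; }            // 0..255
int scoreIndexForLDS()              { return MaxVGPRs; }            // 256
int scoreIndexForSGPR(int Encoding) { return AllVGPRs + Encoding; } // 257..

} // namespace reg_index_model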
+ InstCounterType eventCounter(WaitEventType E) { + switch (E) { + case VMEM_ACCESS: + return VM_CNT; + case LDS_ACCESS: + case GDS_ACCESS: + case SQ_MESSAGE: + case SMEM_ACCESS: + return LGKM_CNT; + case EXP_GPR_LOCK: + case GDS_GPR_LOCK: + case VMW_GPR_LOCK: + case EXP_POS_ACCESS: + case EXP_PARAM_ACCESS: + return EXP_CNT; + default: + llvm_unreachable("unhandled event type"); + } + return NUM_INST_CNTS; + } + + void setRegScore(int GprNo, InstCounterType T, int32_t Val) { + if (GprNo < NUM_ALL_VGPRS) { + if (GprNo > VgprUB) { + VgprUB = GprNo; + } + VgprScores[T][GprNo] = Val; + } else { + assert(T == LGKM_CNT); + if (GprNo - NUM_ALL_VGPRS > SgprUB) { + SgprUB = GprNo - NUM_ALL_VGPRS; + } + SgprScores[GprNo - NUM_ALL_VGPRS] = Val; + } + } + + int32_t getRegScore(int GprNo, InstCounterType T) { + if (GprNo < NUM_ALL_VGPRS) { + return VgprScores[T][GprNo]; + } + return SgprScores[GprNo - NUM_ALL_VGPRS]; + } + + void clear() { + memset(ScoreLBs, 0, sizeof(ScoreLBs)); + memset(ScoreUBs, 0, sizeof(ScoreUBs)); + memset(EventUBs, 0, sizeof(EventUBs)); + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + memset(VgprScores[T], 0, sizeof(VgprScores[T])); + } + memset(SgprScores, 0, sizeof(SgprScores)); + } + + RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, unsigned OpNo, + bool Def) const; + + void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, + unsigned OpNo, int32_t Val); + + void setWaitAtBeginning() { WaitAtBeginning = true; } + void clearWaitAtBeginning() { WaitAtBeginning = false; } + bool getWaitAtBeginning() const { return WaitAtBeginning; } + void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } + int32_t getMaxVGPR() const { return VgprUB; } + int32_t getMaxSGPR() const { return SgprUB; } + int32_t getEventUB(enum WaitEventType W) const { + assert(W < NUM_WAIT_EVENTS); + return EventUBs[W]; + } + bool counterOutOfOrder(InstCounterType T); + unsigned int updateByWait(InstCounterType T, int ScoreToWait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + BlockWaitcntBrackets() + : WaitAtBeginning(false), ValidLoop(false), MixedExpTypes(false), + LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + memset(VgprScores[T], 0, sizeof(VgprScores[T])); + } + } + ~BlockWaitcntBrackets(){}; + + bool hasPendingSMEM() const { + return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); + } + + bool hasPendingFlat() const { + return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && + LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || + (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && + LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); + } + + void setPendingFlat() { + LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; + LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; + } + + int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } + + void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } + + bool getRevisitLoop() const { return RevisitLoop; } + void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } + + void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; } + int32_t getPostOrder() const { return PostOrder; } + + 
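// --- Aside: a standalone sketch (not from the patch) of the clamp that
// setScoreUB() applies to the EXP_CNT bracket above: once more than
// getWaitCountMax() operations are in flight the hardware counter can no
// longer distinguish the oldest of them, so the lower bound is dragged
// along to keep the bracket no wider than the counter maximum.
#include <algorithm>
#include <cstdint>

namespace bracket_model {

struct Bracket {
  int32_t LB = 0;
  int32_t UB = 0;
};

// Raise the upper bound and keep the bracket no wider than CounterMax.
void pushScore(Bracket &B, int32_t NewUB, int32_t CounterMax) {
  B.UB = NewUB;
  B.LB = std::max(B.LB, B.UB - CounterMax);
}

} // namespace bracket_model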
void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } + void clearWaitcnt() { Waitcnt = NULL; } + MachineInstr *getWaitcnt() const { return Waitcnt; } + + bool mixedExpTypes() const { return MixedExpTypes; } + void setMixedExpTypes(bool MixedExpTypesIn) { + MixedExpTypes = MixedExpTypesIn; + } + + void print(raw_ostream &); + void dump() { print(dbgs()); } + +private: + bool WaitAtBeginning; + bool RevisitLoop; + bool ValidLoop; + bool MixedExpTypes; + MachineLoop *LoopRegion; + int32_t PostOrder; + MachineInstr *Waitcnt; + int32_t ScoreLBs[NUM_INST_CNTS] = {0}; + int32_t ScoreUBs[NUM_INST_CNTS] = {0}; + int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; + // Remember the last flat memory operation. + int32_t LastFlat[NUM_INST_CNTS] = {0}; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int32_t VgprUB; + int32_t SgprUB; + int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + // Wait cnt scores for every sgpr, only lgkmcnt is relevant. + int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; +}; + +// This is a per-loop-region object that records waitcnt status at the end of +// loop footer from the previous iteration. We also maintain an iteration +// count to track the number of times the loop has been visited. When it +// doesn't converge naturally, we force convergence by inserting s_waitcnt 0 +// at the end of the loop footer. +class LoopWaitcntData { +public: + void incIterCnt() { IterCnt++; } + void resetIterCnt() { IterCnt = 0; } + int32_t getIterCnt() { return IterCnt; } + + LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {} + ~LoopWaitcntData(){}; + + void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } + MachineInstr *getWaitcnt() const { return LfWaitcnt; } + + void print() { + DEBUG(dbgs() << " iteration " << IterCnt << '\n';); + return; + } + +private: + // s_waitcnt added at the end of loop footer to stablize wait scores + // at the end of the loop footer. + MachineInstr *LfWaitcnt; + // Number of iterations the loop has been visited, not including the initial + // walk over. + int32_t IterCnt; +}; + +class SIInsertWaitcnts : public MachineFunctionPass { + +private: + const SISubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + const MachineLoopInfo *MLI; + AMDGPU::IsaInfo::IsaVersion IV; + AMDGPUAS AMDGPUASI; + + DenseSet<MachineBasicBlock *> BlockVisitedSet; + DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet; + DenseSet<MachineInstr *> VCCZBugHandledSet; + + DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>> + BlockWaitcntBracketsMap; + + DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet; + + DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap; + + std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets; + +public: + static char ID; + + SIInsertWaitcnts() + : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr), + MRI(nullptr), MLI(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + void addKillWaitBracket(BlockWaitcntBrackets *Bracket) { + // The waitcnt information is copied because it changes as the block is + // traversed. 
+ KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket)); + } + + MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI, + BlockWaitcntBrackets *ScoreBrackets); + void updateEventWaitCntAfter(MachineInstr &Inst, + BlockWaitcntBrackets *ScoreBrackets); + void mergeInputScoreBrackets(MachineBasicBlock &Block); + MachineBasicBlock *loopBottom(const MachineLoop *Loop); + void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); + void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); +}; + +} // End anonymous namespace. + +RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, + const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + unsigned OpNo, + bool Def) const { + const MachineOperand &Op = MI->getOperand(OpNo); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || + (Def && !Op.isDef())) + return {-1, -1}; + + // A use via a PW operand does not need a waitcnt. + // A partial write is not a WAW. + assert(!Op.getSubReg() || !Op.isUndef()); + + RegInterval Result; + const MachineRegisterInfo &MRIA = *MRI; + + unsigned Reg = TRI->getEncodingValue(Op.getReg()); + + if (TRI->isVGPR(MRIA, Op.getReg())) { + assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); + Result.first = Reg - RegisterEncoding.VGPR0; + assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); + } else if (TRI->isSGPRReg(MRIA, Op.getReg())) { + assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Result.first >= NUM_ALL_VGPRS && + Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); + } + // TODO: Handle TTMP + // else if (TRI->isTTMP(MRIA, Reg.getReg())) ... + else + return {-1, -1}; + + const MachineInstr &MIA = *MI; + const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); + unsigned Size = RC->getSize(); + Result.second = Result.first + (Size / 4); + + return Result; +} + +void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + unsigned OpNo, int32_t Val) { + RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); + DEBUG({ + const MachineOperand &Opnd = MI->getOperand(OpNo); + assert(TRI->isVGPR(*MRI, Opnd.getReg())); + }); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + setRegScore(RegNo, EXP_CNT, Val); + } +} + +void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + WaitEventType E, MachineInstr &Inst) { + const MachineRegisterInfo &MRIA = *MRI; + InstCounterType T = eventCounter(E); + int32_t CurrScore = getScoreUB(T) + 1; + // EventUB and ScoreUB need to be update regardless if this event changes + // the score of a register or not. + // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. + EventUBs[E] = CurrScore; + setScoreUB(T, CurrScore); + + if (T == EXP_CNT) { + // Check for mixed export types. If they are mixed, then a waitcnt exp(0) + // is required. + if (!MixedExpTypes) { + MixedExpTypes = counterOutOfOrder(EXP_CNT); + } + + // Put score on the source vgprs. If this is a store, just use those + // specific register(s). + if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { + // All GDS operations must protect their address register (same as + // export.) 
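// --- Aside: a standalone version (not from the patch) of the interval
// computation done by getRegInterval() above. One machine operand maps to a
// half-open range of 32-bit scoring slots: the base slot comes from the
// hardware encoding, and the width is the register-class size in bytes
// divided by 4, so a 64-bit pair covers two slots and a 128-bit tuple four.
#include <utility>

namespace interval_model {

using RegInterval = std::pair<int, int>; // [first, second)

RegInterval regInterval(int BaseSlot, unsigned RegClassSizeInBytes) {
  return {BaseSlot, BaseSlot + static_cast<int>(RegClassSizeInBytes / 4)};
}

} // namespace interval_model
// e.g. regInterval(8, 16) gives {8, 12}: a 128-bit register starting at slot 8.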
+ if (Inst.getOpcode() != AMDGPU::DS_APPEND && + Inst.getOpcode() != AMDGPU::DS_CONSUME) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr), + CurrScore); + } + if (Inst.mayStore()) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), + CurrScore); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data1) != -1) { + setExpScore(&Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data1), + CurrScore); + } + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 && + Inst.getOpcode() != AMDGPU::DS_GWS_INIT && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && + Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P && + Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER && + Inst.getOpcode() != AMDGPU::DS_APPEND && + Inst.getOpcode() != AMDGPU::DS_CONSUME && + Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = Inst.getOperand(I); + if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) { + setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); + } + } + } + } else if (TII->isFLAT(Inst)) { + if (Inst.mayStore()) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else if (TII->isMIMG(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else if (TII->isMTBUF(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } + } else if (TII->isMUBUF(Inst)) { + if (Inst.mayStore()) { + setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore); + } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), + CurrScore); + } + } else { + if (TII->isEXP(Inst)) { + // For export the destination registers are really temps that + // can be used as the actual source after export patching, so + // we need to treat them like sources and set the EXP_CNT + // score. + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + MachineOperand &DefMO = Inst.getOperand(I); + if (DefMO.isReg() && DefMO.isDef() && + TRI->isVGPR(MRIA, DefMO.getReg())) { + setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT, + CurrScore); + } + } + } + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + MachineOperand &MO = Inst.getOperand(I); + if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) { + setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); + } + } + } +#if 0 // TODO: check if this is handled by MUBUF code above. 
+ } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { + MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); + unsigned OpNo;//TODO: find the OpNo for this operand; + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; + ++RegNo) { + setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); + } +#endif + } else { + // Match the score to the destination registers. + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true); + if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS) + continue; + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + setRegScore(RegNo, T, CurrScore); + } + } + if (TII->isDS(Inst) && Inst.mayStore()) { + setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); + } + } +} + +void BlockWaitcntBrackets::print(raw_ostream &OS) { + OS << '\n'; + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int LB = getScoreLB(T); + int UB = getScoreUB(T); + + switch (T) { + case VM_CNT: + OS << " VM_CNT(" << UB - LB << "): "; + break; + case LGKM_CNT: + OS << " LGKM_CNT(" << UB - LB << "): "; + break; + case EXP_CNT: + OS << " EXP_CNT(" << UB - LB << "): "; + break; + default: + OS << " UNKNOWN(" << UB - LB << "): "; + break; + } + + if (LB < UB) { + // Print vgpr scores. + for (int J = 0; J <= getMaxVGPR(); J++) { + int RegScore = getRegScore(J, T); + if (RegScore <= LB) + continue; + int RelScore = RegScore - LB - 1; + if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { + OS << RelScore << ":v" << J << " "; + } else { + OS << RelScore << ":ds "; + } + } + // Also need to print sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= getMaxSGPR(); J++) { + int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (RegScore <= LB) + continue; + int RelScore = RegScore - LB - 1; + OS << RelScore << ":s" << J << " "; + } + } + } + OS << '\n'; + } + OS << '\n'; + return; +} + +unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, + int ScoreToWait) { + unsigned int NeedWait = 0; + if (ScoreToWait == -1) { + // The score to wait is unknown. This implies that it was not encountered + // during the path of the CFG walk done during the current traversal but + // may be seen on a different path. Emit an s_wait counter with a + // conservative value of 0 for the counter. + NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + return NeedWait; + } + + // If the score of src_operand falls within the bracket, we need an + // s_waitcnt instruction. + const int32_t LB = getScoreLB(T); + const int32_t UB = getScoreUB(T); + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if (T == VM_CNT && hasPendingFlat()) { + // If there is a pending FLAT operation, and this is a VM waitcnt, + // then we need to force a waitcnt 0 for VM. + NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the brack. Also emit an s_wait counter + // with a conservative value of 0 for the counter. 
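// --- Aside: a standalone sketch (not from the patch) of the decision made
// by updateByWait(). A wait is needed only when the score being waited on
// still lies inside the bracket (LB, UB]; the wait must conservatively drain
// the whole counter when a FLAT operation is pending (the pass applies this
// only to VM_CNT) or when events of different types can retire out of order;
// otherwise the lower bound simply advances to the waited-on score. An
// unknown score (-1 in the pass) is likewise treated as "wait now".
#include <cstdint>

namespace wait_decision_model {

struct Decision {
  bool NeedWait;
  int32_t NewLB; // lower bound after the wait has been accounted for
};

Decision updateByWait(int32_t LB, int32_t UB, int32_t ScoreToWait,
                      bool PendingFlat, bool OutOfOrder) {
  if (ScoreToWait > LB && ScoreToWait <= UB) {
    // Drain everything when in-order completion cannot be trusted.
    if (PendingFlat || OutOfOrder)
      return {true, UB};
    return {true, ScoreToWait};
  }
  return {false, LB}; // already satisfied by an earlier wait
}

} // namespace wait_decision_model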
+ NeedWait = CNT_MASK(T); + setScoreLB(T, getScoreUB(T)); + } else { + NeedWait = CNT_MASK(T); + setScoreLB(T, ScoreToWait); + } + } + + return NeedWait; +} + +// Where there are multiple types of event in the bracket of a counter, +// the decrement may go out of order. +bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) { + switch (T) { + case VM_CNT: + return false; + case LGKM_CNT: { + if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { + // Scalar memory read always can go out of order. + return true; + } + int NumEventTypes = 0; + if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && + EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && + EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { + NumEventTypes++; + } + if (NumEventTypes <= 1) { + return false; + } + break; + } + case EXP_CNT: { + // If there has been a mixture of export types, then a waitcnt exp(0) is + // required. + if (MixedExpTypes) + return true; + int NumEventTypes = 0; + if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && + EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + + if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && + EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { + NumEventTypes++; + } + + if (NumEventTypes <= 1) { + return false; + } + break; + } + default: + break; + } + return true; +} + +INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, + false) +INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, + false) + +char SIInsertWaitcnts::ID = 0; + +char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; + +FunctionPass *llvm::createSIInsertWaitcntsPass() { + return new SIInsertWaitcnts(); +} + +static bool readsVCCZ(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); +} + +/// \brief Generate s_waitcnt instruction to be placed before cur_Inst. +/// Instructions of a given type are returned in order, +/// but instructions of different types can complete out of order. +/// We rely on this in-order completion +/// and simply assign a score to the memory access instructions. +/// We keep track of the active "score bracket" to determine +/// if an access of a memory read requires an s_waitcnt +/// and if so what the value of each counter is. +/// The "score bracket" is bound by the lower bound and upper bound +/// scores (*_score_LB and *_score_ub respectively). +MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( + MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { + // To emit, or not to emit - that's the question! + // Start with an assumption that there is no need to emit. + unsigned int EmitSwaitcnt = 0; + // s_waitcnt instruction to return; default is NULL. + MachineInstr *SWaitInst = nullptr; + // No need to wait before phi. 
If a phi-move exists, then the wait should
+ // have been inserted before the move. If a phi-move does not exist, then
+ // the wait should be inserted before the real use. The same is true for
+ // sc-merge. It is not a coincidence that all these cases correspond to the
+ // instructions that are skipped in the assembling loop.
+ bool NeedLineMapping = false; // TODO: Check on this.
+ if (MI.isDebugValue() &&
+ // TODO: any other opcode?
+ !NeedLineMapping) {
+ return SWaitInst;
+ }
+
+ // See if an s_waitcnt is forced at block entry, or is needed at
+ // program end.
+ if (ScoreBrackets->getWaitAtBeginning()) {
+ // Note that we have already cleared the state, so we don't need to update
+ // it.
+ ScoreBrackets->clearWaitAtBeginning();
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ EmitSwaitcnt |= CNT_MASK(T);
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ }
+ }
+
+ // See if this instruction has a forced S_WAITCNT VM.
+ // TODO: Handle other cases of NeedsWaitcntVmBefore()
+ else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ EmitSwaitcnt |=
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ }
+
+ // All waits must be resolved at call return.
+ // NOTE: this could be improved with knowledge of all call sites or
+ // with knowledge of the called routines.
+ if (MI.getOpcode() == AMDGPU::RETURN ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ EmitSwaitcnt |= CNT_MASK(T);
+ }
+ }
+ }
+ // Resolve vm waits before gs-done.
+ else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
+ MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
+ AMDGPU::SendMsg::ID_GS_DONE)) {
+ if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
+ ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ }
+ }
+#if 0 // TODO: enable the following blocks of logic when we have fences.
+ else if (MI.getOpcode() == SC_FENCE) { + const unsigned int group_size = + context->shader_info->GetMaxThreadGroupSize(); + // group_size == 0 means thread group size is unknown at compile time + const bool group_is_multi_wave = + (group_size == 0 || group_size > target_info->GetWaveFrontSize()); + const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence(); + + for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) { + SCRegType src_type = Inst->GetSrcType(i); + switch (src_type) { + case SCMEM_LDS: + if (group_is_multi_wave || + context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + // LDS may have to wait for VM_CNT after buffer load to LDS + if (target_info->HasBufferLoadToLDS()) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } + } + break; + + case SCMEM_GDS: + if (group_is_multi_wave || fence_is_global) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + } + break; + + case SCMEM_UAV: + case SCMEM_TFBUF: + case SCMEM_RING: + case SCMEM_SCATTER: + if (group_is_multi_wave || fence_is_global) { + EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } + break; + + case SCMEM_SCRATCH: + default: + break; + } + } + } +#endif + + // Export & GDS instructions do not read the EXEC mask until after the export + // is granted (which can occur well after the instruction is issued). + // The shader program must flush all EXP operations on the export-count + // before overwriting the EXEC mask. + else { + if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { + // Export and GDS are tracked individually, either may trigger a waitcnt + // for EXEC. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); + } + +#if 0 // TODO: the following code to handle CALL. + // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT. + // However, there is a problem with EXP_CNT, because the call cannot + // easily tell if a register is used in the function, and if it did, then + // the referring instruction would have to have an S_WAITCNT, which is + // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs + // before the call. + if (MI.getOpcode() == SC_CALL) { + if (ScoreBrackets->getScoreUB(EXP_CNT) > + ScoreBrackets->getScoreLB(EXP_CNT)) { + ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= CNT_MASK(EXP_CNT); + } + } +#endif + + // Look at the source operands of every instruction to see if + // any of them results from a previous memory operation that affects + // its current usage. If so, an s_waitcnt instruction needs to be + // emitted. + // If the source operand was defined by a load, add the s_waitcnt + // instruction. 
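// --- Aside: a compact standalone statement (not from the patch) of the rule
// the operand scans below implement. A use of a VGPR must wait on VM_CNT
// (vector loads) as well as LGKM_CNT, any other tracked use waits on
// LGKM_CNT (scalar loads, LDS), and a VGPR definition additionally waits on
// EXP_CNT so that a still-pending export or store reading the old value (a
// WAR hazard) completes before the register is overwritten.
namespace operand_rule_model {

enum CounterMask { VM = 1, LGKM = 2, EXP = 4 };

unsigned countersToWaitOn(bool IsVGPR, bool IsDef) {
  unsigned Mask = LGKM;            // scalar loads / LDS can hold any operand
  if (IsVGPR) {
    Mask |= VM;                    // vector memory results land in VGPRs
    if (IsDef)
      Mask |= EXP;                 // WAR against pending exports/stores
  }
  return Mask;
}

} // namespace operand_rule_model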
+ for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUASI.LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + // VM_CNT is only relevant to vgpr or LDS. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + } + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Op.getReg())) { + // VM_CNT is only relevant to vgpr or LDS. + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + } + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + } + } + // End of for loop that looks at all source operands to decide vm_wait_cnt + // and lgk_wait_cnt. + + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. + if (MI.mayStore()) { + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS != AMDGPUASI.LOCAL_ADDRESS) + continue; + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + } + } + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + MachineOperand &Def = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Def.getReg())) { + EmitSwaitcnt |= ScoreBrackets->updateByWait( + VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + } + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + } + } // End of for loop that looks at all dest operands. + } + + // TODO: Tie force zero to a compiler triage option. + bool ForceZero = false; + + // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 + // occurs before the instruction. Doing it here prevents any additional + // S_WAITCNTs from being emitted if the instruction was marked as + // requiring a WAITCNT beforehand. + if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) { + EmitSwaitcnt |= + ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + EmitSwaitcnt |= ScoreBrackets->updateByWait( + LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); + } + + // TODO: Remove this work-around, enable the assert for Bug 457939 + // after fixing the scheduler. Also, the Shader Compiler code is + // independent of target. 
+ if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (ScoreBrackets->getScoreLB(LGKM_CNT) < + ScoreBrackets->getScoreUB(LGKM_CNT) && + ScoreBrackets->hasPendingSMEM()) { + // Wait on everything, not just LGKM. vccz reads usually come from + // terminators, and we always wait on everything at the end of the + // block, so if we only wait on LGKM here, we might end up with + // another s_waitcnt inserted right after this if there are non-LGKM + // instructions still outstanding. + ForceZero = true; + EmitSwaitcnt = true; + } + } + + // Does this operand processing indicate s_wait counter update? + if (EmitSwaitcnt) { + int CntVal[NUM_INST_CNTS]; + + bool UseDefaultWaitcntStrategy = true; + if (ForceZero) { + // Force all waitcnts to 0. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + } + CntVal[VM_CNT] = 0; + CntVal[EXP_CNT] = 0; + CntVal[LGKM_CNT] = 0; + UseDefaultWaitcntStrategy = false; + } + + if (UseDefaultWaitcntStrategy) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + if (EmitSwaitcnt & CNT_MASK(T)) { + int Delta = + ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); + int MaxDelta = ScoreBrackets->getWaitCountMax(T); + if (Delta >= MaxDelta) { + Delta = -1; + if (T != EXP_CNT) { + ScoreBrackets->setScoreLB( + T, ScoreBrackets->getScoreUB(T) - MaxDelta); + } + EmitSwaitcnt &= ~CNT_MASK(T); + } + CntVal[T] = Delta; + } else { + // If we are not waiting for a particular counter then encode + // it as -1 which means "don't care." + CntVal[T] = -1; + } + } + } + + // If we are not waiting on any counter we can skip the wait altogether. + if (EmitSwaitcnt != 0) { + MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); + int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm(); + if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) != + (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || + (AMDGPU::decodeExpcnt(IV, Imm) != + (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || + (AMDGPU::decodeLgkmcnt(IV, Imm) != + (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { + MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); + if (ContainingLoop) { + MachineBasicBlock *TBB = ContainingLoop->getTopBlock(); + BlockWaitcntBrackets *ScoreBracket = + BlockWaitcntBracketsMap[TBB].get(); + if (!ScoreBracket) { + assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); + BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>(); + ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); + } + ScoreBracket->setRevisitLoop(true); + DEBUG(dbgs() << "set-revisit: block" + << ContainingLoop->getTopBlock()->getNumber() << '\n';); + } + } + + // Update an existing waitcount, or make a new one. 
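// --- Aside: a standalone sketch (not from the patch) of how the per-counter
// values end up in a single S_WAITCNT immediate. The pass uses
// AMDGPU::encodeWaitcnt(), where -1 means "don't care" and simply saturates
// the field; field positions and widths are subtarget-dependent (queried via
// AMDGPU::getVmcntBitMask(IV) and friends). The shifts and widths below are
// placeholders chosen only to show the packing arithmetic, not the
// authoritative hardware encoding.
#include <cstdint>

namespace waitcnt_encode_model {

struct Field { unsigned Shift, Width; };

// Placeholder layout -- NOT the real per-generation encoding.
constexpr Field Vm   = {0, 4};
constexpr Field Exp  = {4, 3};
constexpr Field Lgkm = {8, 4};

uint32_t mask(Field F) { return (1u << F.Width) - 1; }

uint32_t pack(Field F, int32_t Count) {
  // -1 ("don't care") saturates to the all-ones value for the field.
  return (static_cast<uint32_t>(Count) & mask(F)) << F.Shift;
}

uint32_t encodeWaitcnt(int32_t VmCnt, int32_t ExpCnt, int32_t LgkmCnt) {
  return pack(Vm, VmCnt) | pack(Exp, ExpCnt) | pack(Lgkm, LgkmCnt);
}

} // namespace waitcnt_encode_model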
+ MachineFunction &MF = *MI.getParent()->getParent(); + if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) { + SWaitInst = OldWaitcnt; + } else { + SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), + MI.getDebugLoc()); + CompilerGeneratedWaitcntSet.insert(SWaitInst); + } + + const MachineOperand &Op = + MachineOperand::CreateImm(AMDGPU::encodeWaitcnt( + IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT])); + SWaitInst->addOperand(MF, Op); + + if (CntVal[EXP_CNT] == 0) { + ScoreBrackets->setMixedExpTypes(false); + } + } + } + + return SWaitInst; +} + +void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, + MachineInstr *Waitcnt) { + if (MBB.empty()) { + MBB.push_back(Waitcnt); + return; + } + + MachineBasicBlock::iterator It = MBB.end(); + MachineInstr *MI = &*(--It); + if (MI->isBranch()) { + MBB.insert(It, Waitcnt); + } else { + MBB.push_back(Waitcnt); + } + + return; +} + +void SIInsertWaitcnts::updateEventWaitCntAfter( + MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { + // Now look at the instruction opcode. If it is a memory access + // instruction, update the upper-bound of the appropriate counter's + // bracket and the destination operand scores. + // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. + if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); + } else { + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + } + } else if (TII->isFLAT(Inst)) { + assert(Inst.mayLoad() || Inst.mayStore()); + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + + // This is a flat memory operation. Check to see if it has memory + // tokens for both LDS and Memory, and if so mark it as a flat. + bool FoundLDSMem = false; + for (const MachineMemOperand *Memop : Inst.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) + FoundLDSMem = true; + } + + // This is a flat memory operation, so note it - it will require + // that both the VM and LGKM be flushed to zero if it is pending when + // a VM or LGKM dependency occurs. + if (FoundLDSMem) { + ScoreBrackets->setPendingFlat(); + } + } else if (SIInstrInfo::isVMEM(Inst) && + // TODO: get a better carve out. 
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 && + Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && + Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); + } + } else if (TII->isSMRD(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + } else { + switch (Inst.getOpcode()) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); + break; + case AMDGPU::EXP: + case AMDGPU::EXP_DONE: { + int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); + if (Imm >= 32 && Imm <= 63) + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); + else if (Imm >= 12 && Imm <= 15) + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); + else + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); + break; + } + case AMDGPU::S_MEMTIME: + case AMDGPU::S_MEMREALTIME: + ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + break; + default: + break; + } + } +} + +void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); + int32_t MaxPending[NUM_INST_CNTS] = {0}; + int32_t MaxFlat[NUM_INST_CNTS] = {0}; + bool MixedExpTypes = false; + + // Clear the score bracket state. + ScoreBrackets->clear(); + + // Compute the number of pending elements on block entry. + + // IMPORTANT NOTE: If iterative handling of loops is added, the code will + // need to handle single BBs with backedges to themselves. This means that + // they will need to retain and not clear their initial state. + + // See if there are any uninitialized predecessors. If so, emit an + // s_waitcnt 0 at the beginning of the block. + for (MachineBasicBlock *pred : Block.predecessors()) { + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[pred].get(); + bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); + if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + break; + } + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int span = + PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); + MaxPending[T] = std::max(MaxPending[T], span); + span = + PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); + MaxFlat[T] = std::max(MaxFlat[T], span); + } + + MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + // Also handle kills for exit block. + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int Span = KillWaitBrackets[I]->getScoreUB(T) - + KillWaitBrackets[I]->getScoreLB(T); + MaxPending[T] = std::max(MaxPending[T], Span); + Span = KillWaitBrackets[I]->pendingFlat(T) - + KillWaitBrackets[I]->getScoreLB(T); + MaxFlat[T] = std::max(MaxFlat[T], Span); + } + + MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes(); + } + } + + // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. 
+ for (MachineBasicBlock *Pred : Block.predecessors()) { + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + break; + } + + int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - + PredScoreBrackets->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); + int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - + PredScoreBrackets->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) - + KillWaitBrackets[I]->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); + int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) - + KillWaitBrackets[I]->getScoreLB(EXP_CNT); + MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); + } + } + +#if 0 + // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker. + // TODO: how does LC distinguish between function entry and main entry? + // If this is the entry to a function, force a wait. + MachineBasicBlock &Entry = Block.getParent()->front(); + if (Entry.getNumber() == Block.getNumber()) { + ScoreBrackets->setWaitAtBeginning(); + return; + } +#endif + + // Now set the current Block's brackets to the largest ending bracket. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + ScoreBrackets->setScoreUB(T, MaxPending[T]); + ScoreBrackets->setScoreLB(T, 0); + ScoreBrackets->setLastFlat(T, MaxFlat[T]); + } + + ScoreBrackets->setMixedExpTypes(MixedExpTypes); + + // Set the register scoreboard. + for (MachineBasicBlock *Pred : Block.predecessors()) { + if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + break; + } + + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + + // Now merge the gpr_reg_score information + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int PredLB = PredScoreBrackets->getScoreLB(T); + int PredUB = PredScoreBrackets->getScoreUB(T); + if (PredLB < PredUB) { + int PredScale = MaxPending[T] - PredUB; + // Merge vgpr scores. + for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { + int PredRegScore = PredScoreBrackets->getRegScore(J, T); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); + } + // Also need to merge sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { + int PredRegScore = + PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J + NUM_ALL_VGPRS, LGKM_CNT, + std::max( + ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), + NewRegScore)); + } + } + } + } + + // Also merge the WaitEvent information. 
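// --- Aside: a standalone form (not from the patch) of the rescaling used in
// the merge above. The block-entry bracket is [0, MaxPending], and each
// predecessor score is shifted by (MaxPending - PredUB) so that "distance
// from the upper bound" -- the only thing the hardware counters actually
// measure -- is preserved; the maximum over all predecessors is kept.
#include <algorithm>
#include <cstdint>

namespace merge_model {

// Rebase one predecessor score into the successor's [0, MaxPending] bracket
// and merge it with the score accumulated so far.
int32_t mergeScore(int32_t Merged, int32_t PredScore, int32_t PredLB,
                   int32_t PredUB, int32_t MaxPending) {
  if (PredScore <= PredLB)      // already satisfied in the predecessor
    return Merged;
  int32_t Rebased = PredScore + (MaxPending - PredUB);
  return std::max(Merged, Rebased);
}

} // namespace merge_model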
+ ForAllWaitEventType(W) { + enum InstCounterType T = PredScoreBrackets->eventCounter(W); + int PredEventUB = PredScoreBrackets->getEventUB(W); + if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { + int NewEventUB = + MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); + if (NewEventUB > 0) { + ScoreBrackets->setEventUB( + W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); + } + } + } + } + + // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? + // Set the register scoreboard. + if (Block.succ_empty() && !KillWaitBrackets.empty()) { + for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { + // Now merge the gpr_reg_score information. + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + int PredLB = KillWaitBrackets[I]->getScoreLB(T); + int PredUB = KillWaitBrackets[I]->getScoreUB(T); + if (PredLB < PredUB) { + int PredScale = MaxPending[T] - PredUB; + // Merge vgpr scores. + for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) { + int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); + } + // Also need to merge sgpr scores for lgkm_cnt. + if (T == LGKM_CNT) { + for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) { + int PredRegScore = + KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + if (PredRegScore <= PredLB) + continue; + int NewRegScore = PredScale + PredRegScore; + ScoreBrackets->setRegScore( + J + NUM_ALL_VGPRS, LGKM_CNT, + std::max( + ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), + NewRegScore)); + } + } + } + } + + // Also merge the WaitEvent information. + ForAllWaitEventType(W) { + enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W); + int PredEventUB = KillWaitBrackets[I]->getEventUB(W); + if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) { + int NewEventUB = + MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T); + if (NewEventUB > 0) { + ScoreBrackets->setEventUB( + W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); + } + } + } + } + } + + // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the + // sequencing predecessors, because changes to EXEC require waitcnts due to + // the delayed nature of these operations. + for (MachineBasicBlock *Pred : Block.predecessors()) { + if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + break; + } + + BlockWaitcntBrackets *PredScoreBrackets = + BlockWaitcntBracketsMap[Pred].get(); + + int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); + if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { + int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - + PredScoreBrackets->getScoreUB(EXP_CNT); + if (new_gds_ub > 0) { + ScoreBrackets->setEventUB( + GDS_GPR_LOCK, + std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); + } + } + int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); + if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { + int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - + PredScoreBrackets->getScoreUB(EXP_CNT); + if (new_exp_ub > 0) { + ScoreBrackets->setEventUB( + EXP_GPR_LOCK, + std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); + } + } + } +} + +/// Return the "bottom" block of a loop. This differs from +/// MachineLoop::getBottomBlock in that it works even if the loop is +/// discontiguous. 
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { + MachineBasicBlock *Bottom = Loop->getHeader(); + for (MachineBasicBlock *MBB : Loop->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} + +// Generate s_waitcnt instructions where needed. +void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, + MachineBasicBlock &Block) { + // Initialize the state information. + mergeInputScoreBrackets(Block); + + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); + + DEBUG({ + dbgs() << "Block" << Block.getNumber(); + ScoreBrackets->dump(); + }); + + bool InsertNOP = false; + + // Walk over the instructions. + for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); + Iter != E;) { + MachineInstr &Inst = *Iter; + // Remove any previously existing waitcnts. + if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { + // TODO: Register the old waitcnt and optimize the following waitcnts. + // Leaving the previously existing waitcnts is conservatively correct. + if (CompilerGeneratedWaitcntSet.find(&Inst) == + CompilerGeneratedWaitcntSet.end()) + ++Iter; + else { + ScoreBrackets->setWaitcnt(&Inst); + ++Iter; + Inst.removeFromParent(); + } + continue; + } + + // Kill instructions generate a conditional branch to the endmain block. + // Merge the current waitcnt state into the endmain block information. + // TODO: Are there other flavors of KILL instruction? + if (Inst.getOpcode() == AMDGPU::KILL) { + addKillWaitBracket(ScoreBrackets); + } + + bool VCCZBugWorkAround = false; + if (readsVCCZ(Inst) && + (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) { + if (ScoreBrackets->getScoreLB(LGKM_CNT) < + ScoreBrackets->getScoreUB(LGKM_CNT) && + ScoreBrackets->hasPendingSMEM()) { + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) + VCCZBugWorkAround = true; + } + } + + // Generate an s_waitcnt instruction to be placed before + // cur_Inst, if needed. + MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); + + if (SWaitInst) { + Block.insert(Inst, SWaitInst); + if (ScoreBrackets->getWaitcnt() != SWaitInst) { + DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << Inst << '\n' + << "New Instr: " << *SWaitInst << '\n';); + } + } + + updateEventWaitCntAfter(Inst, ScoreBrackets); + +#if 0 // TODO: implement resource type check controlled by options with ub = LB. + // If this instruction generates a S_SETVSKIP because it is an + // indexed resource, and we are on Tahiti, then it will also force + // an S_WAITCNT vmcnt(0) + if (RequireCheckResourceType(Inst, context)) { + // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. + ScoreBrackets->setScoreLB(VM_CNT, + ScoreBrackets->getScoreUB(VM_CNT)); + } +#endif + + ScoreBrackets->clearWaitcnt(); + + if (SWaitInst) { + DEBUG({ SWaitInst->print(dbgs() << '\n'); }); + } + DEBUG({ + Inst.print(dbgs()); + ScoreBrackets->dump(); + }); + + // Check to see if this is a GWS instruction. If so, and if this is CI or + // VI, then the generated code sequence will include an S_WAITCNT 0. + // TODO: Are these the only GWS instructions? 
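// --- Aside: a schematic, standalone view (not from the patch) of the
// per-block walk in insertWaitcntInBlock(). Stripped of the hardware
// workarounds and of the step that strips waitcnts inserted on an earlier
// visit, every instruction goes through the same pipeline: first compute and
// insert any wait required *before* the instruction from the current
// brackets, then update the brackets with the events it generates itself.
#include <vector>

namespace block_walk_model {

template <typename Inst, typename Brackets, typename NeedsWaitFn,
          typename InsertWaitFn, typename UpdateFn>
void walkBlock(std::vector<Inst> &Block, Brackets &B, NeedsWaitFn NeedsWait,
               InsertWaitFn InsertWaitBefore, UpdateFn UpdateAfter) {
  for (Inst &I : Block) {
    if (NeedsWait(I, B))        // does the bracket demand a wait before I?
      InsertWaitBefore(I);      // emit the s_waitcnt in front of I
    UpdateAfter(I, B);          // score the events I generates itself
  }
}

} // namespace block_walk_model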
+ if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || + Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { + // TODO: && context->target_info->GwsRequiresMemViolTest() ) { + ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); + ScoreBrackets->updateByWait(LGKM_CNT, + ScoreBrackets->getScoreUB(LGKM_CNT)); + } + + // TODO: Remove this work-around after fixing the scheduler and enable the + // assert above. + if (VCCZBugWorkAround) { + // Restore the vccz bit. Any time a value is written to vcc, the vcc + // bit is updated, so we can restore the bit by reading the value of + // vcc and then writing it back to the register. + BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::VCC) + .addReg(AMDGPU::VCC); + VCCZBugHandledSet.insert(&Inst); + } + + if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + + // This avoids a s_nop after a waitcnt has just been inserted. + if (!SWaitInst && InsertNOP) { + BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + } + InsertNOP = false; + + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + bool IsSMEM = false; + bool IsVMEM = false; + if (TII->isSMRD(Inst)) + IsSMEM = true; + else if (TII->usesVM_CNT(Inst)) + IsVMEM = true; + + ++Iter; + if (Iter == E) + break; + + MachineInstr &Next = *Iter; + + // TODO: How about consecutive SMEM instructions? + // The comments above says break the clause but the code does not. + // if ((TII->isSMRD(next) && isSMEM) || + if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM && + // TODO: Enable this check when hasSoftClause is upstreamed. + // ST->hasSoftClauses() && + ST->isXNACKEnabled()) { + // Insert a NOP to break the clause. + InsertNOP = true; + continue; + } + + // There must be "S_NOP 0" between an instruction writing M0 and + // S_SENDMSG. + if ((Next.getOpcode() == AMDGPU::S_SENDMSG || + Next.getOpcode() == AMDGPU::S_SENDMSGHALT) && + Inst.definesRegister(AMDGPU::M0)) + InsertNOP = true; + + continue; + } + + ++Iter; + } + + // Check if we need to force convergence at loop footer. + MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); + if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + WaitcntData->print(); + DEBUG(dbgs() << '\n';); + + // The iterative waitcnt insertion algorithm aims for optimal waitcnt + // placement and doesn't always guarantee convergence for a loop. Each + // loop should take at most 2 iterations for it to converge naturally. + // When this max is reached and result doesn't converge, we force + // convergence by inserting a s_waitcnt at the end of loop footer. + if (WaitcntData->getIterCnt() > 2) { + // To ensure convergence, need to make wait events at loop footer be no + // more than those from the previous iteration. + // As a simplification, Instead of tracking individual scores and + // generate the precise wait count, just wait on 0. 
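// --- Aside: a standalone sketch (not from the patch) of the convergence
// rule described above. Loop blocks may be re-walked while the score
// brackets still change; if the loop has not settled after two iterations
// the pass stops trying to compute a precise count and pins the footer with
// "s_waitcnt 0", which empties every bracket and so guarantees a fixed point.
namespace loop_converge_model {

struct Bracket { int LB = 0; int UB = 0; };

// True when a forced "wait on 0" must be placed at the loop footer.
bool forceWaitZeroAtFooter(int IterCnt, Bracket (&B)[3]) {
  if (IterCnt <= 2)
    return false;               // still allowed to converge naturally
  bool HasPending = false;
  for (Bracket &X : B) {
    if (X.UB > X.LB) {
      X.LB = X.UB;              // account for the full drain
      HasPending = true;
    }
  }
  return HasPending;
}

} // namespace loop_converge_model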
+ bool HasPending = false; + MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { + ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); + HasPending = true; + } + } + + if (HasPending) { + if (!SWaitInst) { + SWaitInst = Block.getParent()->CreateMachineInstr( + TII->get(AMDGPU::S_WAITCNT), DebugLoc()); + CompilerGeneratedWaitcntSet.insert(SWaitInst); + const MachineOperand &Op = MachineOperand::CreateImm(0); + SWaitInst->addOperand(MF, Op); +#if 0 // TODO: Format the debug output + OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); + OutputTransformAdd(SWaitInst, context); +#endif + } +#if 0 // TODO: ?? + _DEV( REPORTED_STATS->force_waitcnt_converge = 1; ) +#endif + } + + if (SWaitInst) { + DEBUG({ + SWaitInst->print(dbgs()); + dbgs() << "\nAdjusted score board:"; + ScoreBrackets->dump(); + }); + + // Add this waitcnt to the block. It is either newly created or + // created in previous iterations and added back since block traversal + // always remove waitcnt. + insertWaitcntBeforeCF(Block, SWaitInst); + WaitcntData->setWaitcnt(SWaitInst); + } + } + } +} + +bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<SISubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + AMDGPUASI = ST->getAMDGPUAS(); + + HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + + HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); + HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); + assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS); + assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + + RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + RegisterEncoding.VGPRL = + RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1; + RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + RegisterEncoding.SGPRL = + RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + + // Walk over the blocks in reverse post-dominator order, inserting + // s_waitcnt where needed. + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + bool Modified = false; + for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator + I = RPOT.begin(), + E = RPOT.end(), J = RPOT.begin(); + I != E;) { + MachineBasicBlock &MBB = **I; + + BlockVisitedSet.insert(&MBB); + + BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); + if (!ScoreBrackets) { + BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>(); + ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); + } + ScoreBrackets->setPostOrder(MBB.getNumber()); + MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB); + if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr) + LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>(); + + // If we are walking into the block from before the loop, then guarantee + // at least 1 re-walk over the loop to propagate the information, even if + // no S_WAITCNT instructions were generated. 
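The counter maxima fetched from AMDGPU::getVmcntBitMask and friends above are just the all-ones values of the s_waitcnt immediate fields. A sketch assuming the pre-GFX9 field layout (vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]); later targets move and widen fields, which is why the pass asks the IsaVersion helpers instead of hard-coding masks:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed pre-GFX9 layout of the s_waitcnt immediate.
constexpr unsigned VmcntMax = 0xf, ExpcntMax = 0x7, LgkmcntMax = 0xf;

uint16_t encodeWaitcnt(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  assert(Vm <= VmcntMax && Exp <= ExpcntMax && Lgkm <= LgkmcntMax);
  return Vm | (Exp << 4) | (Lgkm << 8);
}

void decodeWaitcnt(uint16_t Imm, unsigned &Vm, unsigned &Exp, unsigned &Lgkm) {
  Vm = Imm & VmcntMax;
  Exp = (Imm >> 4) & ExpcntMax;
  Lgkm = (Imm >> 8) & LgkmcntMax;
}

int main() {
  // "s_waitcnt vmcnt(0)": wait for VM only; a field at its maximum means
  // "do not wait on that counter".
  uint16_t Imm = encodeWaitcnt(0, ExpcntMax, LgkmcntMax);
  unsigned Vm, Exp, Lgkm;
  decodeWaitcnt(Imm, Vm, Exp, Lgkm);
  std::printf("0x%x -> vm=%u exp=%u lgkm=%u\n", (unsigned)Imm, Vm, Exp, Lgkm);
  return 0;
}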
+ if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I && + (BlockWaitcntProcessedSet.find(&MBB) == + BlockWaitcntProcessedSet.end())) { + BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); + DEBUG(dbgs() << "set-revisit: block" + << ContainingLoop->getTopBlock()->getNumber() << '\n';); + } + + // Walk over the instructions. + insertWaitcntInBlock(MF, MBB); + + // Flag that waitcnts have been processed at least once. + BlockWaitcntProcessedSet.insert(&MBB); + + // See if we want to revisit the loop. + if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) { + MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock(); + BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); + if (EntrySB && EntrySB->getRevisitLoop()) { + EntrySB->setRevisitLoop(false); + J = I; + int32_t PostOrder = EntrySB->getPostOrder(); + // TODO: Avoid this loop. Find another way to set I. + for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator + X = RPOT.begin(), + Y = RPOT.end(); + X != Y; ++X) { + MachineBasicBlock &MBBX = **X; + if (MBBX.getNumber() == PostOrder) { + I = X; + break; + } + } + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + WaitcntData->incIterCnt(); + DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';); + continue; + } else { + LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); + // Loop converged, reset iteration count. If this loop gets revisited, + // it must be from an outer loop, the counter will restart, this will + // ensure we don't force convergence on such revisits. + WaitcntData->resetIterCnt(); + } + } + + J = I; + ++I; + } + + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + + bool HaveScalarStores = false; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + + MachineBasicBlock &MBB = *BI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; + ++I) { + + if (!HaveScalarStores && TII->isScalarStore(*I)) + HaveScalarStores = true; + + if (I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } + } + + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could be + // improved by looking across blocks for flushes in postdominating blocks + // from the stores but an explicitly requested flush is probably very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. 
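The iterator rewind above is easier to see on a toy reverse post-order list. Block numbers, the loop extent, and the convergence flag below are made up for illustration; the bounded iteration count mirrors the "at most 2 iterations, then force convergence" rule earlier in the pass:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> RPO = {0, 1, 2, 3, 4};   // blocks in reverse post-order
  const int LoopHead = 1, LoopBottom = 3;   // toy single-loop region
  int IterCnt = 0;
  bool RevisitRequested = true;             // pretend the first walk changed state

  for (size_t I = 0; I < RPO.size();) {
    std::printf("visit block %d\n", RPO[I]);
    if (RPO[I] == LoopBottom && RevisitRequested && IterCnt < 2) {
      ++IterCnt;
      RevisitRequested = false;             // assume one re-walk converges
      // Jump back to the loop header's position in the RPO list.
      for (size_t X = 0; X < RPO.size(); ++X)
        if (RPO[X] == LoopHead) { I = X; break; }
      continue;
    }
    ++I;
  }
  std::printf("loop re-walked %d time(s)\n", IterCnt);
  return 0;
}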
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } + } + } + } + + return Modified; +} diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index fceabd7a8fdd..47257ce16ceb 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -21,16 +21,32 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <new> +#include <utility> #define DEBUG_TYPE "si-insert-waits" using namespace llvm; -using namespace llvm::AMDGPU; namespace { @@ -42,7 +58,6 @@ typedef union { unsigned LGKM; } Named; unsigned Array[3]; - } Counters; typedef enum { @@ -55,13 +70,12 @@ typedef Counters RegCounters[512]; typedef std::pair<unsigned, unsigned> RegInterval; class SIInsertWaits : public MachineFunctionPass { - private: - const SISubtarget *ST; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; + const SISubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI; - IsaVersion IV; + AMDGPU::IsaInfo::IsaVersion ISA; /// \brief Constant zero value static const Counters ZeroCounts; @@ -86,7 +100,7 @@ private: RegCounters DefinedRegs; /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen; + unsigned ExpInstrTypesSeen = 0; /// \brief Type of the last opcode. InstType LastOpcodeType; @@ -100,7 +114,7 @@ private: bool ReturnsVoid; /// Whether the VCCZ bit is possibly corrupt - bool VCCZCorrupt; + bool VCCZCorrupt = false; /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -141,13 +155,7 @@ private: public: static char ID; - SIInsertWaits() : - MachineFunctionPass(ID), - ST(nullptr), - TII(nullptr), - TRI(nullptr), - ExpInstrTypesSeen(0), - VCCZCorrupt(false) { } + SIInsertWaits() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -161,7 +169,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, "SI Insert Waits", false, false) @@ -294,7 +302,6 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Increment) { - // Get the hardware counter increments and sum them up Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -366,7 +373,6 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Required) { - // End of program? 
No need to wait on anything // A function not returning void needs to wait, because other bytecode will // be appended after it and we don't know what it will be. @@ -393,7 +399,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, bool NeedWait = false; for (unsigned i = 0; i < 3; ++i) { - if (Required.Array[i] <= WaitedOn.Array[i]) continue; @@ -421,10 +426,10 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(encodeWaitcnt(IV, - Counts.Named.VM, - Counts.Named.EXP, - Counts.Named.LGKM)); + .addImm(AMDGPU::encodeWaitcnt(ISA, + Counts.Named.VM, + Counts.Named.EXP, + Counts.Named.LGKM)); LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -434,7 +439,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, /// \brief helper function for handleOperands static void increaseCounters(Counters &Dst, const Counters &Src) { - for (unsigned i = 0; i < 3; ++i) Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } @@ -453,9 +457,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { unsigned Imm = I->getOperand(0).getImm(); Counters Counts, WaitOn; - Counts.Named.VM = decodeVmcnt(IV, Imm); - Counts.Named.EXP = decodeExpcnt(IV, Imm); - Counts.Named.LGKM = decodeLgkmcnt(IV, Imm); + Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm); + Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm); + Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); for (unsigned i = 0; i < 3; ++i) { if (Counts.Array[i] <= LastIssued.Array[i]) @@ -468,7 +472,6 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { } Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - Counters Result = ZeroCounts; // For each register affected by this instruction increase the result @@ -484,7 +487,6 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { - if (Op.isDef()) { increaseCounters(Result, UsedRegs[j]); increaseCounters(Result, DefinedRegs[j]); @@ -522,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, } } +/// Return true if \p MBB has one successor immediately following, and is its +/// only predecessor +static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { + if (MBB.succ_size() != 1) + return false; + + const MachineBasicBlock *Succ = *MBB.succ_begin(); + return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ); +} + // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. 
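hasTrivialSuccessor() above is the entire test that lets the end-of-block wait be deferred into the next block. A standalone restatement with a toy block type instead of MachineBasicBlock:

#include <cstdio>
#include <vector>

struct Block {
  std::vector<Block *> Succs, Preds;
  Block *LayoutNext = nullptr;              // block emitted immediately after
};

bool hasTrivialSuccessor(const Block &B) {
  if (B.Succs.size() != 1)
    return false;
  const Block *Succ = B.Succs.front();
  // Single successor, we are its only predecessor, and it follows in layout:
  // the wait can be issued there instead of at the end of this block.
  return Succ->Preds.size() == 1 && B.LayoutNext == Succ;
}

int main() {
  Block A, B;
  A.Succs = {&B};
  B.Preds = {&A};
  A.LayoutNext = &B;
  std::printf("defer wait: %d\n", hasTrivialSuccessor(A)); // 1
  return 0;
}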
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { @@ -531,12 +543,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - IV = getIsaVersion(ST->getFeatureBits()); + ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - HardwareLimits.Named.VM = getVmcntBitMask(IV); - HardwareLimits.Named.EXP = getExpcntBitMask(IV); - HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV); + HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA); + HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA); + HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; @@ -636,12 +648,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { handleSendMsg(MBB, I); if (I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN) + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) EndPgmBlocks.push_back(&MBB); } - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + // Wait for everything at the end of the MBB. If there is only one + // successor, we can defer this until the uses there. + if (!hasTrivialSuccessor(MBB)) + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } if (HaveScalarStores) { @@ -665,7 +679,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // FIXME: It would be better to insert this before a waitcnt if any. if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) { Changes = true; BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); } @@ -676,5 +690,19 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineInstr *I : RemoveMI) I->eraseFromParent(); + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Changes = true; + } + return Changes; } diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 5523ec142ba7..b83a1fe187eb 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -31,6 +31,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bit VOP2 = 0; field bit VOPC = 0; field bit VOP3 = 0; + field bit VOP3P = 0; field bit VINTRP = 0; field bit SDWA = 0; field bit DPP = 0; @@ -78,6 +79,10 @@ class InstSI <dag outs, dag ins, string asm = "", // is unable to infer the encoding from the operands. field bit VOPAsmPrefer32Bit = 0; + // This bit indicates that this has a floating point result type, so + // the clamp modifier has floating point semantics. + field bit FPClamp = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
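The TSFlags assignments just below pack these format and feature bits into the 64-bit per-instruction flag word that the SIInstrInfo is*/has* predicates test. A sketch of that packing; the bit positions follow the .td lines here (0 = SALU, 12 = VOP3P, 42 = FPClamp), but the enum itself is illustrative rather than the real SIInstrFlags:

#include <cstdint>
#include <cstdio>

namespace FlagsSketch {
  constexpr uint64_t SALU    = UINT64_C(1) << 0;
  constexpr uint64_t VOP3P   = UINT64_C(1) << 12;
  constexpr uint64_t FPClamp = UINT64_C(1) << 42;
}

bool isVOP3P(uint64_t TSFlags)    { return TSFlags & FlagsSketch::VOP3P; }
bool hasFPClamp(uint64_t TSFlags) { return TSFlags & FlagsSketch::FPClamp; }

int main() {
  // A hypothetical packed FP instruction: VOP3P encoding, FP clamp semantics.
  uint64_t Flags = FlagsSketch::VOP3P | FlagsSketch::FPClamp;
  std::printf("vop3p=%d fpclamp=%d salu=%d\n",
              isVOP3P(Flags), hasFPClamp(Flags),
              (Flags & FlagsSketch::SALU) ? 1 : 0);
  return 0;
}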
let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -92,6 +97,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{8} = VOP2; let TSFlags{9} = VOPC; let TSFlags{10} = VOP3; + let TSFlags{12} = VOP3P; let TSFlags{13} = VINTRP; let TSFlags{14} = SDWA; @@ -120,6 +126,7 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{39} = ScalarStore; let TSFlags{40} = FixedSize; let TSFlags{41} = VOPAsmPrefer32Bit; + let TSFlags{42} = FPClamp; let SchedRW = [Write32Bit]; @@ -131,19 +138,19 @@ class InstSI <dag outs, dag ins, string asm = "", let AsmVariantName = AMDGPUAsmVariants.Default; } -class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : InstSI<outs, ins, "", pattern> { +class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : InstSI<outs, ins, asm, pattern> { let isPseudo = 1; let isCodeGenOnly = 1; } -class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : PseudoInstSI<outs, ins, pattern> { +class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : PseudoInstSI<outs, ins, pattern, asm> { let SALU = 1; } -class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []> - : PseudoInstSI<outs, ins, pattern> { +class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> + : PseudoInstSI<outs, ins, pattern, asm> { let VALU = 1; let Uses = [EXEC]; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 26a8d22062a9..05ac67d26620 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" @@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); SIInstrInfo::SIInstrInfo(const SISubtarget &ST) - : AMDGPUInstrInfo(ST), RI(), ST(ST) {} + : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -315,7 +316,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, const MachineOperand *SecondDst = nullptr; if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || + (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { @@ -346,6 +348,21 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; } +static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) { + MachineFunction *MF = MBB.getParent(); + DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), + "illegal SGPR to VGPR copy", + DL, DS_Error); + LLVMContext &C = MF->getFunction()->getContext(); + C.diagnose(IllegalCopy); + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} + void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, 
unsigned DestReg, @@ -369,7 +386,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -391,7 +412,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -415,8 +440,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = AMDGPU::S_MOV_B32; EltSize = 4; } + + if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } } + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); @@ -870,9 +901,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineInstr *MovRel = BuildMI(MBB, MI, DL, MovRelDesc) .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(2)) .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + .addReg(VecReg, + RegState::Implicit | (IsUndef ? RegState::Undef : 0)); const int ImpDefIdx = MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); @@ -897,14 +929,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // constant data. Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) - .addOperand(MI.getOperand(1))); + .add(MI.getOperand(1))); MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi); if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) MIB.addImm(0); else - MIB.addOperand(MI.getOperand(2)); + MIB.add(MI.getOperand(2)); Bundler.append(MIB); llvm::finalizeBundle(MBB, Bundler.begin()); @@ -1290,6 +1322,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, return Count; } +// Copy the flags onto the implicit condition register operand. +static void preserveCondRegFlags(MachineOperand &CondReg, + const MachineOperand &OrigCond) { + CondReg.setIsUndef(OrigCond.isUndef()); + CondReg.setIsKill(OrigCond.isKill()); +} + unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, @@ -1317,9 +1356,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, .addMBB(TBB); // Copy the flags onto the implicit condition register operand. 
- MachineOperand &CondReg = CondBr->getOperand(1); - CondReg.setIsUndef(Cond[1].isUndef()); - CondReg.setIsKill(Cond[1].isKill()); + preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); if (BytesAdded) *BytesAdded = 4; @@ -1351,6 +1388,157 @@ bool SIInstrInfo::reverseBranchCondition( return false; } +bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, + int &TrueCycles, int &FalseCycles) const { + switch (Cond[0].getImm()) { + case VCCNZ: + case VCCZ: { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? + + // Limit to equal cost for branch vs. N v_cndmask_b32s. + return !RI.isSGPRClass(RC) && NumInsts <= 6; + } + case SCC_TRUE: + case SCC_FALSE: { + // FIXME: We could insert for VGPRs if we could replace the original compare + // with a vector one. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); + assert(MRI.getRegClass(FalseReg) == RC); + + int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + + // Multiples of 8 can do s_cselect_b64 + if (NumInsts % 2 == 0) + NumInsts /= 2; + + CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? + return RI.isSGPRClass(RC); + } + default: + return false; + } +} + +void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); + if (Pred == VCCZ || Pred == SCC_FALSE) { + Pred = static_cast<BranchPredicate>(-Pred); + std::swap(TrueReg, FalseReg); + } + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + unsigned DstSize = DstRC->getSize(); + + if (DstSize == 4) { + unsigned SelOp = Pred == SCC_TRUE ? + AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; + + // Instruction's operands are backwards from what is expected. + MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + if (DstSize == 8 && Pred == SCC_TRUE) { + MachineInstr *Select = + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + return; + } + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + }; + + unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; + const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; + const int16_t *SubIndices = Sub0_15; + int NElts = DstSize / 4; + + // 64-bit select is only avaialble for SALU. 
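The cost estimate in canInsertSelect() above boils down to one 32-bit conditional move per dword, with the scalar path able to halve that via s_cselect_b64. A rough standalone restatement; it omits the SGPR/VGPR legality checks and the cap of 6 that the hook also applies:

#include <cstdio>

int selectCost(int RegBitWidth, bool ScalarCond) {
  int NumInsts = RegBitWidth / 32;          // one v_cndmask_b32 per dword
  if (ScalarCond && NumInsts % 2 == 0)
    NumInsts /= 2;                          // pairs handled by s_cselect_b64
  return NumInsts;
}

int main() {
  std::printf("vector 64-bit:  %d insts\n", selectCost(64, false));  // 2
  std::printf("scalar 64-bit:  %d inst\n",  selectCost(64, true));   // 1
  std::printf("vector 128-bit: %d insts\n", selectCost(128, false)); // 4
  return 0;
}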
+ if (Pred == SCC_TRUE) { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + + assert(NElts % 2 == 0); + NElts /= 2; + } + + MachineInstrBuilder MIB = BuildMI( + MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); + + I = MIB->getIterator(); + + SmallVector<unsigned, 8> Regs; + for (int Idx = 0; Idx != NElts; ++Idx) { + unsigned DstElt = MRI.createVirtualRegister(EltRC); + Regs.push_back(DstElt); + + unsigned SubIdx = SubIndices[Idx]; + + MachineInstr *Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, 0, SubIdx) + .addReg(TrueReg, 0, SubIdx); + preserveCondRegFlags(Select->getOperand(3), Cond[1]); + + MIB.addReg(DstElt) + .addImm(SubIdx); + } +} + +bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: { + // If there are additional implicit register operands, this may be used for + // register indexing so the source register operand isn't simply copied. + unsigned NumOps = MI.getDesc().getNumOperands() + + MI.getDesc().getNumImplicitUses(); + + return MI.getNumOperands() == NumOps; + } + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -1400,15 +1588,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { - bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; - - // Don't fold if we are using source modifiers. The new VOP2 instructions - // don't have them. - if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { + // Don't fold if we are using source or output modifiers. The new VOP2 + // instructions don't have them. + if (hasAnyModifiersSet(UseMI)) return false; - } const MachineOperand &ImmOp = DefMI.getOperand(1); @@ -1421,6 +1604,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (isInlineConstant(UseMI, *Src0, ImmOp)) return false; + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -1633,20 +1817,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src0Mods = + getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mods = + getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); return BuildMI(*MBB, MI, MI.getDebugLoc(), get(IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) - .addOperand(*Dst) - .addImm(0) // Src0 mods - .addOperand(*Src0) - .addImm(0) // Src1 mods - .addOperand(*Src1) + .add(*Dst) + .addImm(Src0Mods ? Src0Mods->getImm() : 0) + .add(*Src0) + .addImm(Src1Mods ? Src1Mods->getImm() : 0) + .add(*Src1) .addImm(0) // Src mods - .addOperand(*Src2) - .addImm(0) // clamp - .addImm(0); // omod + .add(*Src2) + .addImm(Clamp ? Clamp->getImm() : 0) + .addImm(Omod ? Omod->getImm() : 0); } // It's not generally safe to move VALU instructions across these since it will @@ -1687,7 +1877,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); case 16: - return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); default: llvm_unreachable("invalid bitwidth"); @@ -1705,24 +1896,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, // would be for any 32-bit integer operand, but would not be for a 64-bit one. int64_t Imm = MO.getImm(); - switch (operandBitWidth(OperandType)) { - case 32: { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); return Trunc == Imm && AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } - case 64: { + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: { return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); } - case 16: { + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { + // A few special case instructions have 16-bit operands on subtargets + // where 16-bit instructions are not legal. + // TODO: Do the 32-bit immediates work? 
We shouldn't really need to handle + // constants in these cases int16_t Trunc = static_cast<int16_t>(Imm); - return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); } return false; } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + uint32_t Trunc = static_cast<uint32_t>(Imm); + return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); + } default: llvm_unreachable("invalid bitwidth"); } @@ -1801,6 +2011,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, return Mods && Mods->getImm(); } +bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { + return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::clamp) || + hasModifiersSet(MI, AMDGPU::OpName::omod); +} + bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -2238,7 +2456,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); + BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); MO.ChangeToRegister(Reg, false); } @@ -2564,8 +2782,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, return; unsigned DstReg = MRI.createVirtualRegister(DstRC); - MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) - .addOperand(Op); + MachineInstr *Copy = + BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); Op.setReg(DstReg); Op.setSubReg(0); @@ -2810,13 +3028,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { // Regular buffer load / store. MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) + .add(*VData) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset); + .add(*SRsrc) + .add(*SOffset) + .add(*Offset); // Atomics do not have this operand. if (const MachineOperand *GLC = @@ -2836,14 +3054,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } else { // Atomics with return. Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addOperand(*VDataIn) + .add(*VData) + .add(*VDataIn) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. 
- .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) + .add(*SRsrc) + .add(*SOffset) + .add(*Offset) .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } @@ -2970,6 +3188,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); + + case AMDGPU::S_PACK_LL_B32_B16: + case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HH_B32_B16: { + movePackToVALU(Worklist, MRI, Inst); + Inst.eraseFromParent(); + continue; + } } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -3027,12 +3253,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); unsigned NewDstReg = AMDGPU::NoRegister; if (HasDst) { + unsigned DstReg = Inst.getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + continue; + // Update the destination register class. const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); if (!NewDstRC) continue; - unsigned DstReg = Inst.getOperand(0).getReg(); if (Inst.isCopy() && TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { @@ -3112,15 +3341,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0); + BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1); + BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -3174,8 +3401,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp( unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); + .add(SrcReg0Sub0) + .add(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); @@ -3184,8 +3411,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp( unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); + .add(SrcReg0Sub1) + .add(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -3231,13 +3458,9 @@ void SIInstrInfo::splitScalar64BitBCNT( MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); - BuildMI(MBB, MII, DL, InstDesc, MidReg) - .addOperand(SrcRegSub0) - .addImm(0); + BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); - BuildMI(MBB, MII, DL, InstDesc, ResultReg) - .addOperand(SrcRegSub1) - .addReg(MidReg); + BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); @@ -3326,6 +3549,68 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } +void 
SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, + MachineRegisterInfo &MRI, + MachineInstr &Inst) const { + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineBasicBlock *MBB = Inst.getParent(); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + + switch (Inst.getOpcode()) { + case AMDGPU::S_PACK_LL_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // FIXME: Can do a lot better if we know the high bits of src0 or src1 are + // 0. + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff); + + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) + .addReg(ImmReg, RegState::Kill) + .add(Src0); + + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) + .add(Src1) + .addImm(16) + .addReg(TmpReg, RegState::Kill); + break; + } + case AMDGPU::S_PACK_LH_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) + .addReg(ImmReg, RegState::Kill) + .add(Src0) + .add(Src1); + break; + } + case AMDGPU::S_PACK_HH_B32_B16: { + unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Src0); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) + .addImm(0xffff); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) + .add(Src1) + .addReg(ImmReg, RegState::Kill) + .addReg(TmpReg, RegState::Kill); + break; + } + default: + llvm_unreachable("unhandled s_pack_* instruction"); + } + + MachineOperand &Dest = Inst.getOperand(0); + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::addSCCDefUsersToVALUWorklist( MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { // This assumes that all the users of SCC are in the same block @@ -3448,10 +3733,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { - RsrcDataFormat |= (1ULL << 56); + // Set ATC = 1. GFX9 doesn't have this bit. + if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) + RsrcDataFormat |= (1ULL << 56); - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) - // Set MTYPE = 2 + // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. + // BTW, it disables TC L2 and therefore decreases performance. + if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (2ULL << 59); } @@ -3463,11 +3751,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; - uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + // GFX9 doesn't have ELEMENT_SIZE. 
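The VALU sequences emitted by movePackToVALU() above compute, in plain bit math, the values below. The LL and LH formulas follow directly from the v_and/v_lshl_or and v_bfi expansions shown; the HH formula follows the opcode naming convention (high halves of both sources), which is how I read the v_lshrrev/v_and_or sequence:

#include <cstdint>
#include <cstdio>

uint32_t packLL(uint32_t Src0, uint32_t Src1) {
  return (Src0 & 0xffffu) | (Src1 << 16);             // lo(src0), lo(src1)
}
uint32_t packLH(uint32_t Src0, uint32_t Src1) {
  return (Src0 & 0xffffu) | (Src1 & 0xffff0000u);     // lo(src0), hi(src1)
}
uint32_t packHH(uint32_t Src0, uint32_t Src1) {
  return (Src0 >> 16) | ((Src1 >> 16) << 16);         // hi(src0), hi(src1)
}

int main() {
  uint32_t A = 0x11112222u, B = 0x33334444u;
  std::printf("LL=%08x LH=%08x HH=%08x\n",
              packLL(A, B), packLH(A, B), packHH(A, B));
  // Expected: LL=44442222 LH=33332222 HH=33331111
  return 0;
}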
+ if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) { + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; + } - Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | - // IndexStride = 64 - (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); + // IndexStride = 64. + Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. @@ -3496,7 +3787,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return AMDGPU::NoRegister; assert(!MI.memoperands_empty() && - (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); + (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); FrameIndex = Addr->getIndex(); return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); @@ -3552,16 +3843,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (DescSize != 0 && DescSize != 4) return DescSize; - if (Opc == AMDGPU::WAVE_BARRIER) - return 0; - // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { - if (isFixedSize(MI)) { - assert(DescSize == 4); + if (isFixedSize(MI)) return DescSize; - } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) @@ -3584,7 +3870,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return 4; switch (Opc) { - case AMDGPU::SI_MASK_BRANCH: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: @@ -3609,7 +3894,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return true; for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) return true; } return false; @@ -3640,3 +3925,21 @@ ScheduleHazardRecognizer * SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { return new GCNHazardRecognizer(MF); } + +bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { + return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI); +} + +MachineInstrBuilder +SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned DestReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index e68f6f92ba96..659473ca6a47 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -69,6 +69,9 @@ private: MachineInstr &Inst) const; void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const; + void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, + MachineRegisterInfo &MRI, + MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, @@ -203,10 +206,24 @@ public: bool reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const override; + + bool canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int 
&CondCycles, + int &TrueCycles, int &FalseCycles) const override; + + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; + bool isFoldableCopy(const MachineInstr &MI) const; + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -308,6 +325,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isSDWA(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SDWA; + } + + bool isSDWA(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SDWA; + } + static bool isVOPC(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::VOPC; } @@ -420,6 +445,22 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DPP; } + static bool isVOP3P(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3P; + } + + bool isVOP3P(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3P; + } + + static bool isVINTRP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VINTRP; + } + + bool isVINTRP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VINTRP; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -454,6 +495,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE; } + static bool hasFPClamp(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp; + } + + bool hasFPClamp(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -462,28 +511,6 @@ public: return !RI.isSGPRReg(MRI, Dest); } - static int operandBitWidth(uint8_t OperandType) { - switch (OperandType) { - case AMDGPU::OPERAND_REG_IMM_INT32: - case AMDGPU::OPERAND_REG_IMM_FP32: - case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: - return 32; - case AMDGPU::OPERAND_REG_IMM_INT64: - case AMDGPU::OPERAND_REG_IMM_FP64: - case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: - return 64; - case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: - return 16; - default: - llvm_unreachable("unexpected operand type"); - } - } - bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; @@ -571,6 +598,7 @@ public: bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const; + bool hasAnyModifiersSet(const MachineInstr &MI) const; bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; @@ -731,6 +759,17 @@ public: ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; + + bool isBasicBlockPrologue(const MachineInstr &MI) const override; + + /// \brief Return a partially built integer add instruction without carry. + /// Caller must add source operands. + /// For pre-GFX9 it will generate unused carry destination operand. + /// TODO: After GFX9 it should return a no-carry operation. 
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned DestReg) const; }; namespace AMDGPU { @@ -741,6 +780,9 @@ namespace AMDGPU { int getVOPe32(uint16_t Opcode); LLVM_READONLY + int getSDWAOp(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index ebaefae3bfef..c6daf743f3ac 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -71,11 +71,6 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; -def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, - SDTCisVT<3, i32>]> ->; - class SDSample<string opcode> : SDNode <opcode, SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> @@ -107,7 +102,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad, >; def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ @@ -144,7 +139,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore, def si_st_local : PatFrag < (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; def si_store_local : PatFrag < @@ -196,6 +191,21 @@ def si_uniform_br_scc : PatFrag < return isCBranchSCC(N); }]>; +def lshr_rev : PatFrag < + (ops node:$src1, node:$src0), + (srl $src0, $src1) +>; + +def ashr_rev : PatFrag < + (ops node:$src1, node:$src0), + (sra $src0, $src1) +>; + +def lshl_rev : PatFrag < + (ops node:$src1, node:$src0), + (shl $src0, $src1) +>; + multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { def _glue : SDNode < @@ -266,10 +276,6 @@ def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; -def IMM20bit : PatLeaf <(imm), - [{return isUInt<20>(N->getZExtValue());}] ->; - class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; @@ -299,6 +305,19 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{ return Limit < 10; }]>; +def NegateImm : SDNodeXForm<imm, [{ + return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +// TODO: When FP inline imm values work? 
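The *_rev fragments defined above exist because the VALU "REV" shift opcodes take the shift amount in src0 and the value being shifted in src1, the reverse of the generic shift nodes. In plain C++ terms:

#include <cstdint>
#include <cstdio>

// Operand order as the hardware expects it: amount first, value second.
uint32_t lshrRev(uint32_t ShiftAmt, uint32_t Value) { return Value >> ShiftAmt; }
uint32_t lshlRev(uint32_t ShiftAmt, uint32_t Value) { return Value << ShiftAmt; }
int32_t  ashrRev(uint32_t ShiftAmt, int32_t Value)  { return Value >> ShiftAmt; }

int main() {
  // srl(value=0xF0, amount=4) is matched as lshr_rev(amount=4, value=0xF0).
  std::printf("%x %x %d\n", lshrRev(4, 0xF0), lshlRev(4, 0xF), ashrRev(4, -32));
  return 0;
}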
+def NegSubInlineConst32 : ImmLeaf<i32, [{ + return Imm < -16 && Imm >= -64; +}], NegateImm>; + +def NegSubInlineConst16 : ImmLeaf<i16, [{ + return Imm < -16 && Imm >= -64; +}], NegateImm>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -449,6 +468,12 @@ class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> { let ParserMatchClass = MatchClass; } +class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> : + OperandWithDefaultOps<i32, (ops (i32 0))> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; +} + let OperandType = "OPERAND_IMMEDIATE" in { def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; @@ -486,6 +511,11 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; +def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; +def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; +def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; +def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; + def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { @@ -525,6 +555,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass { let ParserMethod = "parseRegOrImmWithFPInputMods"; let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; @@ -577,6 +608,33 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; } +class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "PackedFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedFP"#opSize#"InputMods"; +} + +class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "PackedInt"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +// let PredicateMethod = "isPackedInt"#opSize#"InputMods"; +} + +def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>; +def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>; + +class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> { +// let PrintMethod = "printPackedFPInputMods"; +} + +class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> { + //let PrintMethod = "printPackedIntInputMods"; +} + +def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>; +def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; //===----------------------------------------------------------------------===// // Complex patterns @@ -593,6 +651,14 @@ def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; +// VOP3Mods, but the input 
source is known to never be NaN. +def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; + +def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; + +def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; +def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">; + //===----------------------------------------------------------------------===// // SI assembler operands @@ -604,19 +670,32 @@ def SIOperand { int FLAT_SCR = 0x68; } +// This should be kept in sync with SISrcMods enum def SRCMODS { int NONE = 0; int NEG = 1; + int ABS = 2; + int NEG_ABS = 3; + + int NEG_HI = ABS; + int OP_SEL_0 = 4; + int OP_SEL_1 = 8; } def DSTCLAMP { int NONE = 0; + int ENABLE = 1; } def DSTOMOD { int NONE = 0; } +def TRAPID{ + int LLVM_TRAP = 2; + int LLVM_DEBUG_TRAP = 3; +} + //===----------------------------------------------------------------------===// // // SI Instruction multiclass helpers. @@ -648,8 +727,9 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3, exp_vm:$vm, exp_compr:$compr, i8imm:$en), "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", - [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr), - f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> { + [(node (i8 timm:$tgt), (i8 timm:$en), + f32:$src0, f32:$src1, f32:$src2, f32:$src3, + (i1 timm:$compr), (i1 timm:$vm))]> { let AsmMatchConverter = "cvtExp"; } @@ -666,6 +746,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> { def _si : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>, EXPe { + let AssemblerPredicates = [isSICI]; let DecoderNamespace = "SICI"; let DisableDecoder = DisableSIDecoder; } @@ -673,6 +754,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> { def _vi : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>, EXPe_vi { + let AssemblerPredicates = [isVI]; let DecoderNamespace = "VI"; let DisableDecoder = DisableVIDecoder; } @@ -706,12 +788,34 @@ class getVALUDstForVT<ValueType VT> { // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); - RegisterOperand ret = !if(isFP, - !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)), - !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32))); + 0)))); + + RegisterOperand ret = + !if(isFP, + !if(!eq(VT.Size, 64), + VSrc_f64, + !if(!eq(VT.Value, f16.Value), + VSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VSrc_f32 + ) + ) + ), + !if(!eq(VT.Size, 64), + VSrc_b64, + !if(!eq(VT.Value, i16.Value), + VSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VSrc_b32 + ) + ) + ) + ); } // Returns the vreg register class to use for source operand given VT @@ -725,25 +829,38 @@ class getVregSrcForVT<ValueType VT> { // given VT. 
class getVOP3SrcForVT<ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, v2f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, - 0))); + 0)))); RegisterOperand ret = !if(!eq(VT.Size, 128), - VSrc_128, - !if(!eq(VT.Size, 64), + VSrc_128, + !if(!eq(VT.Size, 64), !if(isFP, - VCSrc_f64, - VCSrc_b64), + VCSrc_f64, + VCSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_b64, - !if(isFP, - !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32), - !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32) - ) - ) - ) - ); + SCSrc_b64, + !if(isFP, + !if(!eq(VT.Value, f16.Value), + VCSrc_f16, + !if(!eq(VT.Value, v2f16.Value), + VCSrc_v2f16, + VCSrc_f32 + ) + ), + !if(!eq(VT.Value, i16.Value), + VCSrc_b16, + !if(!eq(VT.Value, v2i16.Value), + VCSrc_v2b16, + VCSrc_b32 + ) + ) + ) + ) + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. @@ -753,7 +870,8 @@ class isFloatType<ValueType SrcVT> { !if(!eq(SrcVT.Value, f16.Value), 1, !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, - 0))); + !if(!eq(SrcVT.Value, v2f16.Value), 1, + 0)))); } class isIntType<ValueType SrcVT> { @@ -764,6 +882,23 @@ class isIntType<ValueType SrcVT> { 0))); } +class isPackedType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, v2i16.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) + ); +} + +// Float or packed int +class isModifierType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, + !if(!eq(SrcVT.Value, v2i16.Value), 1, + 0))))); +} // Return type of input modifiers operand for specified input operand class getSrcMod <ValueType VT> { @@ -771,6 +906,7 @@ class getSrcMod <ValueType VT> { !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); + bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), !if(isFP, @@ -801,8 +937,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { // Returns the input arguments for VOP3 instructions for the given SrcVT. 
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, RegisterOperand Src2RC, int NumSrcArgs, - bit HasModifiers, Operand Src0Mod, Operand Src1Mod, - Operand Src2Mod> { + bit HasModifiers, bit HasOMod, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), @@ -821,9 +957,13 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), // VOP 2 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, omod:$omod) + !if( !eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp)) /* else */, // VOP2 without modifiers (ins Src0RC:$src0, Src1RC:$src1) @@ -831,16 +971,57 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), // VOP3 with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, omod:$omod) + !if (!eq(HasOMod, 1), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp)) /* else */, // VOP3 without modifiers (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) /* endif */ )))); } +/// XXX - src1 may only allow VGPRs? + +// The modifiers (except clamp) are dummy operands for the benefit of +// printing and parsing. They defer their values to looking at the +// srcN_modifiers for what to print. +class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, + bit HasClamp, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !if (!eq(NumSrcArgs, 2), + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)), + // else NumSrcArgs == 3 + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + op_sel:$op_sel, op_sel_hi:$op_sel_hi, + neg_lo:$neg_lo, neg_hi:$neg_hi)) + ); +} + class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { @@ -924,7 +1105,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. 
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, + bit HasOMod, ValueType DstVT = i32> { string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", @@ -934,7 +1116,26 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = !if(!eq(HasModifiers, 0), getAsm32<HasDst, NumSrcArgs, DstVT>.ret, - dst#", "#src0#src1#src2#"$clamp"#"$omod"); + dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", "")); +} + +// Returns the assembly string for the inputs and outputs of a VOP3P +// instruction. +class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers, + bit HasClamp, ValueType DstVT = i32> { + string dst = " $vdst"; + string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string mods = !if(HasModifiers, "$neg_lo$neg_hi", ""); + string clamp = !if(HasClamp, "$clamp", ""); + + // Each modifier is printed as an array of bits for each operand, so + // all operands are printed as part of src0_modifiers. + string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp; } class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { @@ -1035,7 +1236,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; - + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; @@ -1046,7 +1247,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1); // TODO: Modifiers logic is somewhat adhoc here, to be refined later - field bit HasModifiers = isFloatType<Src0VT>.ret; + field bit HasModifiers = isModifierType<Src0VT>.ret; field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret; field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret; @@ -1060,12 +1261,20 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0); field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); - field bit HasOMod = HasModifiers; field bit HasClamp = HasModifiers; field bit HasSDWAClamp = HasSrc0; + field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; + + field bit IsPacked = isPackedType<Src0VT>.ret; + field bit HasOpSel = IsPacked; + field bit HasOMod = !if(HasOpSel, 0, HasModifiers); field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); + field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods); + field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs)); // VOP3b instructions are a special case with a second explicit @@ -1077,7 +1286,12 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + HasModifiers, 
HasOMod, Src0Mod, Src1Mod, + Src2Mod>.ret; + field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64, + NumSrcArgs, HasClamp, + Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret; + field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, @@ -1085,7 +1299,8 @@ class VOPProfile <list<ValueType> _ArgVT> { DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; - field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; + field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret; field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; } @@ -1101,11 +1316,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; -def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; -def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; +def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; +def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; +def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; + +def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; +def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; @@ -1117,6 +1339,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; +def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; @@ -1126,6 +1350,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; @@ -1213,6 +1438,15 @@ def getVOPe32 : InstrMapping { let ValueCols = [["4", "0"]]; } +// Maps ordinary instructions to their SDWA counterparts +def getSDWAOp : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 38e31e75ee67..2f89503e129a 100644 --- 
a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -111,6 +111,12 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] +def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> { + let hasSideEffects = 1; + let SALU = 1; + let usesCustomInserter = 1; +} + let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; @@ -146,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let mayStore = 1; let isBarrier = 1; let isConvergent = 1; + let FixedSize = 1; + let Size = 0; } // SI pseudo instructions. These are used by the CFG structurizer pass @@ -153,48 +161,44 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), // Dummy terminator instruction to use after control flow instructions // replaced with exec mask operations. -def SI_MASK_BRANCH : PseudoInstSI < +def SI_MASK_BRANCH : VPseudoInstSI < (outs), (ins brtarget:$target)> { let isBranch = 0; let isTerminator = 1; let isBarrier = 0; - let Uses = [EXEC]; let SchedRW = []; let hasNoSchedulingInfo = 1; + let FixedSize = 1; + let Size = 0; } let isTerminator = 1 in { def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> { + [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { let Constraints = ""; let Size = 12; - let mayLoad = 1; - let mayStore = 1; let hasSideEffects = 1; } def SI_ELSE : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { let Constraints = "$src = $dst"; let Size = 12; - let mayStore = 1; - let mayLoad = 1; let hasSideEffects = 1; } def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> { + [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> { let Size = 8; - let isBranch = 1; + let isBranch = 0; let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; } -} // End isBranch = 1, isTerminator = 1 +} // End isTerminator = 1 def SI_END_CF : CFPseudoInstSI < (outs), (ins SReg_64:$saved), @@ -202,9 +206,9 @@ def SI_END_CF : CFPseudoInstSI < let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; - let mayLoad = 1; - let mayStore = 1; let hasSideEffects = 1; + let mayLoad = 1; // FIXME: Should not need memory flags + let mayStore = 1; } def SI_BREAK : CFPseudoInstSI < @@ -244,6 +248,10 @@ def SI_KILL_TERMINATOR : SPseudoInstSI < let isTerminator = 1; } +def SI_ILLEGAL_COPY : SPseudoInstSI < + (outs unknown:$dst), (ins unknown:$src), + [], " ; illegal copy $src to $dst">; + } // End Uses = [EXEC], Defs = [EXEC,VCC] // Branch on undef scc. Used to avoid intermediate copy from @@ -259,6 +267,14 @@ def SI_PS_LIVE : PseudoInstSI < let SALU = 1; } +def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), + [(int_amdgcn_unreachable)], + "; divergent unreachable"> { + let Size = 0; + let hasNoSchedulingInfo = 1; + let FixedSize = 1; +} + // Used as an isel pseudo to directly emit initialization with an // s_mov_b32 rather than a copy of another initialized // register. 
MachineCSE skips copies, and we don't want to have to @@ -270,12 +286,12 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { let isReMaterializable = 1; } -def SI_RETURN : SPseudoInstSI < - (outs), (ins variable_ops), [(AMDGPUreturn)]> { +// Return for returning shaders to a shader variant epilog. +def SI_RETURN_TO_EPILOG : SPseudoInstSI < + (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; - let hasSideEffects = 1; let hasNoSchedulingInfo = 1; let DisableWQM = 1; } @@ -383,9 +399,18 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat< + (trap), + (S_TRAP_PSEUDO TRAPID.LLVM_TRAP) +>; def : Pat< - (int_amdgcn_else i64:$src, bb:$target), + (debugtrap), + (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP) +>; + +def : Pat< + (AMDGPUelse i64:$src, bb:$target), (SI_ELSE $src, $target, 0) >; @@ -423,24 +448,37 @@ def : Pat < } // End Predicates = [UnsafeFPMath] + +// f16_to_fp patterns def : Pat < - (f32 (fpextend f16:$src)), - (V_CVT_F32_F16_e32 $src) + (f32 (f16_to_fp i32:$src0)), + (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < - (f64 (fpextend f16:$src)), - (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) + (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Pat < + (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Pat < + (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < - (f16 (fpround f32:$src)), - (V_CVT_F16_F32_e32 $src) + (f64 (fpextend f16:$src)), + (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) >; +// fp_to_fp16 patterns def : Pat < - (f16 (fpround f64:$src)), - (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod) >; def : Pat < @@ -480,6 +518,16 @@ multiclass FMADPat <ValueType vt, Instruction inst> { defm : FMADPat <f16, V_MAC_F16_e64>; defm : FMADPat <f32, V_MAC_F32_e64>; +class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat< + (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod), + (VOP3Mods f32:$src1, i32:$src1_mod), + (VOP3Mods f32:$src2, i32:$src2_mod))), + (inst $src0_mod, $src0, $src1_mod, $src1, + $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>; + multiclass SelectPat <ValueType vt, Instruction inst> { def : Pat < (vt (select i1:$src0, vt:$src1, vt:$src2)), @@ -578,6 +626,16 @@ def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, VGPR_32>; def : BitConvert <i32, f32, SReg_32>; def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <v2i16, i32, SReg_32>; +def : BitConvert <i32, v2i16, SReg_32>; +def : BitConvert <v2f16, i32, SReg_32>; +def : BitConvert <i32, v2f16, SReg_32>; +def : BitConvert <v2i16, v2f16, SReg_32>; +def : BitConvert <v2f16, v2i16, SReg_32>; +def : BitConvert <v2f16, f32, SReg_32>; +def : BitConvert <f32, v2f16, SReg_32>; +def : BitConvert <v2i16, f32, SReg_32>; +def : BitConvert <f32, v2i16, SReg_32>; // 64-bit bitcast def : BitConvert <i64, f64, VReg_64>; @@ -619,12 +677,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ -def 
: Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) + +// If denormals are not enabled, it only impacts the compare of the +// inputs. The output result is not flushed. +class ClampPat<Instruction inst, ValueType vt> : Pat < + (vt (AMDGPUclamp + (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (inst i32:$src0_modifiers, vt:$src0, + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) >; +def : ClampPat<V_MAX_F32_e64, f32>; +def : ClampPat<V_MAX_F64, f64>; +def : ClampPat<V_MAX_F16_e64, f16>; + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ @@ -678,6 +744,37 @@ def : Pat < >; def : Pat < + (fcopysign f16:$src0, f16:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) +>; + +def : Pat < + (fcopysign f32:$src0, f16:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0, + (V_LSHLREV_B32_e64 (i32 16), $src1)) +>; + +def : Pat < + (fcopysign f64:$src0, f16:$src1), + (REG_SEQUENCE SReg_64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), + (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) +>; + +def : Pat < + (fcopysign f16:$src0, f32:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, + (V_LSHRREV_B32_e64 (i32 16), $src1)) +>; + +def : Pat < + (fcopysign f16:$src0, f64:$src1), + (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, + (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) +>; + +def : Pat < (fneg f16:$src), (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) >; @@ -692,6 +789,25 @@ def : Pat < (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; +def : Pat < + (fneg v2f16:$src), + (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) +>; + +def : Pat < + (fabs v2f16:$src), + (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) +>; + +// This is really (fneg (fabs v2f16:$src)) +// +// fabs is not reported as free because there is modifier for it in +// VOP3P instructions, so it is turned into the bit op. 
+def : Pat < + (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), + (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit +>; + /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ @@ -759,27 +875,6 @@ def : Pat < def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < - (int_AMDGPU_cube v4f32:$src), - (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), - 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), - 0 /* clamp */, 0 /* omod */), sub3) ->; - -def : Pat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) >; @@ -985,6 +1080,11 @@ def : Pat < //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// +def : Pat < + (i32 (AMDGPUfp16_zext f16:$src)), + (COPY $src) +>; + def : Pat < (i32 (trunc i64:$a)), @@ -1028,24 +1128,72 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; +defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; -def : BFEPattern <V_BFE_U32, S_MOV_B32>; +def : Pat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) +>; def : Pat< - (fcanonicalize f16:$src), - (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0) + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0) >; def : Pat< - (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0) >; def : Pat< - (fcanonicalize f64:$src), - (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0) + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +>; + + +// Allow integer inputs +class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat< + (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)), + (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en) +>; + +def : ExpPattern<AMDGPUexport, i32, EXP>; +def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; + +def : Pat < + (v2i16 (build_vector i16:$src0, i16:$src1)), + (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + +// With multiple uses of the shift, this will duplicate the shift and +// increase register pressure. 
+def : Pat < + (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), + (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) >; +def : Pat < + (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), + (v2i16 (S_PACK_HH_B32_B16 $src0, $src1)) +>; + +// TODO: Should source modifiers be matched to v_pack_b32_f16? +def : Pat < + (v2f16 (build_vector f16:$src0, f16:$src1)), + (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + +// def : Pat < +// (v2f16 (scalar_to_vector f16:$src0)), +// (COPY $src0) +// >; + +// def : Pat < +// (v2i16 (scalar_to_vector i16:$src0)), +// (COPY $src0) +// >; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1083,11 +1231,39 @@ def : Pat < // Miscellaneous Optimization Patterns //============================================================================// +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. +def : Pat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (S_SUB_I32 $src0, NegSubInlineConst32:$src1) +>; + def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; +// This matches 16 permutations of +// max(min(x, y), min(max(x, y), z)) +class FPMed3Pat<ValueType vt, + Instruction med3Inst> : Pat< + (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), + (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : FPMed3Pat<f32, V_MED3_F32>; + +let Predicates = [isGFX9] in { +def : FPMed3Pat<f16, V_MED3_F16>; +def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; +def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; +} // End Predicates = [isGFX9] + //============================================================================// // Assembler aliases //============================================================================// diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 5da375468713..7b7cf1635050 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -14,23 +14,7 @@ let TargetPrefix = "SI", isTarget = 1 in { - def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - - def int_SI_export : Intrinsic <[], - [llvm_i32_ty, // en - llvm_i32_ty, // vm (FIXME: should be i1) - llvm_i32_ty, // done (FIXME: should be i1) - llvm_i32_ty, // tgt - llvm_i32_ty, // compr (FIXME: should be i1) - llvm_float_ty, // src0 - llvm_float_ty, // src1 - llvm_float_ty, // src2 - llvm_float_ty], // src3 - [] - >; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed def int_SI_tbuffer_store : Intrinsic < @@ -64,146 +48,4 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty], // tfe(imm) [IntrReadMem, 
IntrArgMemOnly]>; - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>; - - // Fully-flexible SAMPLE instruction. - class SampleRaw : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_v4i32_ty, // sampler(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Image instruction without a sampler. - class Image : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Basic sample - def int_SI_image_sample : SampleRaw; - def int_SI_image_sample_cl : SampleRaw; - def int_SI_image_sample_d : SampleRaw; - def int_SI_image_sample_d_cl : SampleRaw; - def int_SI_image_sample_l : SampleRaw; - def int_SI_image_sample_b : SampleRaw; - def int_SI_image_sample_b_cl : SampleRaw; - def int_SI_image_sample_lz : SampleRaw; - def int_SI_image_sample_cd : SampleRaw; - def int_SI_image_sample_cd_cl : SampleRaw; - - // Sample with comparison - def int_SI_image_sample_c : SampleRaw; - def int_SI_image_sample_c_cl : SampleRaw; - def int_SI_image_sample_c_d : SampleRaw; - def int_SI_image_sample_c_d_cl : SampleRaw; - def int_SI_image_sample_c_l : SampleRaw; - def int_SI_image_sample_c_b : SampleRaw; - def int_SI_image_sample_c_b_cl : SampleRaw; - def int_SI_image_sample_c_lz : SampleRaw; - def int_SI_image_sample_c_cd : SampleRaw; - def int_SI_image_sample_c_cd_cl : SampleRaw; - - // Sample with offsets - def int_SI_image_sample_o : SampleRaw; - def int_SI_image_sample_cl_o : SampleRaw; - def int_SI_image_sample_d_o : SampleRaw; - def int_SI_image_sample_d_cl_o : SampleRaw; - def int_SI_image_sample_l_o : SampleRaw; - def int_SI_image_sample_b_o : SampleRaw; - def int_SI_image_sample_b_cl_o : SampleRaw; - def int_SI_image_sample_lz_o : SampleRaw; - def int_SI_image_sample_cd_o : SampleRaw; - def int_SI_image_sample_cd_cl_o : SampleRaw; - - // Sample with comparison and offsets - def int_SI_image_sample_c_o : SampleRaw; - def int_SI_image_sample_c_cl_o : SampleRaw; - def int_SI_image_sample_c_d_o : SampleRaw; - def int_SI_image_sample_c_d_cl_o : SampleRaw; - def int_SI_image_sample_c_l_o : SampleRaw; - def int_SI_image_sample_c_b_o : SampleRaw; - def int_SI_image_sample_c_b_cl_o : SampleRaw; - def int_SI_image_sample_c_lz_o : SampleRaw; - def int_SI_image_sample_c_cd_o : SampleRaw; - def int_SI_image_sample_c_cd_cl_o : SampleRaw; - - // Basic gather4 - def int_SI_gather4 : SampleRaw; - def int_SI_gather4_cl : SampleRaw; - def int_SI_gather4_l : SampleRaw; - def int_SI_gather4_b : SampleRaw; - def int_SI_gather4_b_cl : SampleRaw; - def int_SI_gather4_lz : SampleRaw; - - // Gather4 with comparison - def int_SI_gather4_c : SampleRaw; - def int_SI_gather4_c_cl : SampleRaw; - def int_SI_gather4_c_l : SampleRaw; - def int_SI_gather4_c_b : SampleRaw; - def int_SI_gather4_c_b_cl : SampleRaw; - def int_SI_gather4_c_lz : SampleRaw; - - // Gather4 with offsets - def int_SI_gather4_o : SampleRaw; - def int_SI_gather4_cl_o : SampleRaw; - def int_SI_gather4_l_o : SampleRaw; - def int_SI_gather4_b_o : SampleRaw; - def int_SI_gather4_b_cl_o : SampleRaw; - def 
int_SI_gather4_lz_o : SampleRaw; - - // Gather4 with comparison and offsets - def int_SI_gather4_c_o : SampleRaw; - def int_SI_gather4_c_cl_o : SampleRaw; - def int_SI_gather4_c_l_o : SampleRaw; - def int_SI_gather4_c_b_o : SampleRaw; - def int_SI_gather4_c_b_cl_o : SampleRaw; - def int_SI_gather4_c_lz_o : SampleRaw; - - def int_SI_getlod : SampleRaw; - - // Image instrinsics. - def int_SI_image_load : Image; - def int_SI_image_load_mip : Image; - def int_SI_getresinfo : Image; - - /* Interpolation Intrinsics */ - - def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; } // End TargetPrefix = "SI", isTarget = 1 - -let TargetPrefix = "amdgcn", isTarget = 1 in { - // Emit 2.5 ulp, no denormal division. Should only be inserted by - // pass based on !fpmath metadata. - def int_amdgcn_fdiv_fast : Intrinsic< - [llvm_float_ty], [llvm_float_ty], [IntrNoMem] - >; - - /* Control flow Intrinsics */ - - def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>; - def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>; - def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>; -} diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 99fe96c0be22..933a16646746 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -39,15 +39,27 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <iterator> +#include <utility> using namespace llvm; @@ -56,39 +68,36 @@ using namespace llvm; namespace { class SILoadStoreOptimizer : public MachineFunctionPass { + + typedef struct { + MachineBasicBlock::iterator I; + MachineBasicBlock::iterator Paired; + unsigned EltSize; + unsigned Offset0; + unsigned Offset1; + unsigned BaseOff; + bool UseST64; + SmallVector<MachineInstr*, 8> InstsToMove; + } CombineInfo; + private: - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - AliasAnalysis *AA; - - static bool offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned EltSize); - - 
MachineBasicBlock::iterator findMatchingDSInst( - MachineBasicBlock::iterator I, - unsigned EltSize, - SmallVectorImpl<MachineInstr*> &InstsToMove); - - MachineBasicBlock::iterator mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove); - - MachineBasicBlock::iterator mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove); + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + AliasAnalysis *AA = nullptr; + + static bool offsetsCanBeCombined(CombineInfo &CI); + + bool findMatchingDSInst(CombineInfo &CI); + + MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); + + MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); public: static char ID; - SILoadStoreOptimizer() - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - AA(nullptr) {} + SILoadStoreOptimizer() : MachineFunctionPass(ID) {} SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); @@ -108,7 +117,7 @@ public: } }; -} // End anonymous namespace. +} // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load / Store Optimizer", false, false) @@ -141,11 +150,10 @@ static void addDefsToList(const MachineInstr &MI, } } -static bool memAccessesCanBeReordered( - MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - const SIInstrInfo *TII, - llvm::AliasAnalysis * AA) { +static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, + MachineBasicBlock::iterator B, + const SIInstrInfo *TII, + AliasAnalysis * AA) { return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) || // RAW or WAR - cannot reorder // WAW - cannot reorder @@ -179,7 +187,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp, ArrayRef<MachineInstr*> InstsToMove, const SIInstrInfo *TII, AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { @@ -191,47 +198,68 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp, return true; } -bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned Size) { +bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? - if (Offset0 == Offset1) + if (CI.Offset0 == CI.Offset1) return false; // This won't be valid if the offset isn't aligned. - if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0)) return false; - unsigned EltOffset0 = Offset0 / Size; - unsigned EltOffset1 = Offset1 / Size; + unsigned EltOffset0 = CI.Offset0 / CI.EltSize; + unsigned EltOffset1 = CI.Offset1 / CI.EltSize; + CI.UseST64 = false; + CI.BaseOff = 0; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. + if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && + isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { + CI.Offset0 = EltOffset0 / 64; + CI.Offset1 = EltOffset1 / 64; + CI.UseST64 = true; + return true; + } // Check if the new offsets fit in the reduced 8-bit range. 
- if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { + CI.Offset0 = EltOffset0; + CI.Offset1 = EltOffset1; return true; + } - // If the offset in elements doesn't fit in 8-bits, we might be able to use - // the stride 64 versions. - if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) - return false; + // Try to shift base address to decrease offsets. + unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0); + CI.BaseOff = std::min(CI.Offset0, CI.Offset1); + + if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { + CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; + CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + CI.UseST64 = true; + return true; + } + + if (isUInt<8>(OffsetDiff)) { + CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize; + CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize; + return true; + } - return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); + return false; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize, - SmallVectorImpl<MachineInstr*> &InstsToMove) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; +bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { + MachineBasicBlock::iterator E = CI.I->getParent()->end(); + MachineBasicBlock::iterator MBBI = CI.I; ++MBBI; SmallVector<const MachineOperand *, 8> DefsToMove; - addDefsToList(*I, DefsToMove); + addDefsToList(*CI.I, DefsToMove); for ( ; MBBI != E; ++MBBI) { - - if (MBBI->getOpcode() != I->getOpcode()) { + if (MBBI->getOpcode() != CI.I->getOpcode()) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: @@ -242,14 +270,14 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, if (MBBI->hasUnmodeledSideEffects()) // We can't re-order this instruction with respect to other memory // opeations, so we fail both conditions mentioned above. - return E; + return false; if (MBBI->mayLoadOrStore() && - !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) { + !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); + CI.InstsToMove.push_back(&*MBBI); addDefsToList(*MBBI, DefsToMove); continue; } @@ -257,13 +285,13 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - addToListsIfDependent(*MBBI, DefsToMove, InstsToMove); + addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); continue; } // Don't merge volatiles. if (MBBI->hasOrderedMemoryRef()) - return E; + return false; // Handle a case like // DS_WRITE_B32 addr, v, idx0 @@ -271,77 +299,67 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, // DS_WRITE_B32 addr, f(w), idx1 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. 
- if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove)) + if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) continue; - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), + AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx); const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); // Check same base pointer. Be careful of subregisters, which can occur with // vectors of pointers. if (AddrReg0.getReg() == AddrReg1.getReg() && AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Paired = MBBI; // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize) && - canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA)) - return MBBI; + if (offsetsCanBeCombined(CI)) + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + return true; } // We've found a load/store that we couldn't merge for some reason. // We could potentially keep looking, but we'd need to make sure that // it was safe to move I and also all the instruction in InstsToMove // down past this instruction. - if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users - ) + // check if we can move I across MBBI and if we can move all I's users + if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) break; } - return E; + return false; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird // cases, like vectors of pointers. - const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - - const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst); - const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst); - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - } + const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst); + + unsigned NewOffset0 = CI.Offset0; + unsigned NewOffset1 = CI.Offset1; + unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32 + : AMDGPU::DS_READ2_B64; + + if (CI.UseST64) + Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 + : AMDGPU::DS_READ2ST64_B64; - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -356,72 +374,70 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( const MCInstrDesc &Read2Desc = TII->get(Opc); const TargetRegisterClass *SuperRC - = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = I->getDebugLoc(); - MachineInstrBuilder Read2 - = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg) - .addOperand(*AddrReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); + DebugLoc DL = CI.I->getDebugLoc(); + + unsigned BaseReg = AddrReg->getReg(); + unsigned BaseRegFlags = 0; + if (CI.BaseOff) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BaseRegFlags = RegState::Kill; + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) + .addImm(CI.BaseOff) + .addReg(AddrReg->getReg()); + } + + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + (void)Read2; const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired, DL, CopyDesc) - .addOperand(*Dest0) // Copy to same destination including flags and sub reg. - .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc) - .addOperand(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. 
+ .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, InstsToMove); + moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(I); - I->eraseFromParent(); - Paired->eraseFromParent(); + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); return Next; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize, - ArrayRef<MachineInstr*> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); const MachineOperand *Data1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + unsigned NewOffset0 = CI.Offset0; + unsigned NewOffset1 = CI.Offset1; + unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32 + : AMDGPU::DS_WRITE2_B64; - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; - } + if (CI.UseST64) + Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
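The SILoadStoreOptimizer hunks above extend offsetsCanBeCombined() to consider a shifted base: when the raw element offsets no longer fit the 8-bit ds_read2/ds_write2 fields, the pass now subtracts a common BaseOff (materialized with one extra v_add_i32_e32 in mergeRead2Pair/mergeWrite2Pair) and retries both the plain and the st64 encodings. Below is a minimal standalone sketch of that decision sequence with a worked example; it is illustrative only and mirrors, rather than reproduces, the patch (names such as Sketch and combineOffsets are invented for the sketch).

    #include <algorithm>
    #include <cassert>

    // Invented names for the sketch; the real pass keeps these in CombineInfo.
    struct Sketch {
      unsigned EltSize;   // 4 for the B32 forms, 8 for the B64 forms
      unsigned Offset0;   // byte offset of the first instruction
      unsigned Offset1;   // byte offset of the second instruction
      unsigned BaseOff;   // byte amount folded into a new base VGPR (v_add_i32)
      bool UseST64;       // whether the *ST64 opcode variant is selected
    };

    static bool fitsU8(unsigned V) { return V <= 0xffu; }

    // Mirrors the decision sequence of offsetsCanBeCombined() above.
    static bool combineOffsets(Sketch &CI) {
      if (CI.Offset0 == CI.Offset1 ||
          CI.Offset0 % CI.EltSize != 0 || CI.Offset1 % CI.EltSize != 0)
        return false;

      unsigned E0 = CI.Offset0 / CI.EltSize;
      unsigned E1 = CI.Offset1 / CI.EltSize;
      CI.UseST64 = false;
      CI.BaseOff = 0;

      // 1. Both element offsets are multiples of 64 and fit after dividing: st64 form.
      if (E0 % 64 == 0 && E1 % 64 == 0 && fitsU8(E0 / 64) && fitsU8(E1 / 64)) {
        CI.Offset0 = E0 / 64;
        CI.Offset1 = E1 / 64;
        CI.UseST64 = true;
        return true;
      }

      // 2. Both element offsets already fit the 8-bit fields: plain read2/write2.
      if (fitsU8(E0) && fitsU8(E1)) {
        CI.Offset0 = E0;
        CI.Offset1 = E1;
        return true;
      }

      // 3./4. Otherwise shift the base by the smaller byte offset and retry.
      unsigned Diff = std::max(E0, E1) - std::min(E0, E1);
      CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

      if (Diff % 64 == 0 && fitsU8(Diff / 64)) {
        CI.Offset0 = (E0 - CI.BaseOff / CI.EltSize) / 64;
        CI.Offset1 = (E1 - CI.BaseOff / CI.EltSize) / 64;
        CI.UseST64 = true;
        return true;
      }

      if (fitsU8(Diff)) {
        CI.Offset0 = E0 - CI.BaseOff / CI.EltSize;
        CI.Offset1 = E1 - CI.BaseOff / CI.EltSize;
        return true;
      }

      return false;
    }

    int main() {
      // Two ds_read_b32 at byte offsets 4096 and 4100: element offsets 1024 and
      // 1025 do not fit 8 bits, so the base is shifted by 4096 (one v_add_i32)
      // and the pair becomes ds_read2_b32 with offset0 = 0, offset1 = 1.
      Sketch CI{4, 4096, 4100, 0, false};
      assert(combineOffsets(CI));
      assert(!CI.UseST64 && CI.BaseOff == 4096 && CI.Offset0 == 0 && CI.Offset1 == 1);
      return 0;
    }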
@@ -434,24 +450,33 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = CI.I->getDebugLoc(); + + unsigned BaseReg = Addr->getReg(); + unsigned BaseRegFlags = 0; + if (CI.BaseOff) { + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BaseRegFlags = RegState::Kill; + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) + .addImm(CI.BaseOff) + .addReg(Addr->getReg()); + } - MachineInstrBuilder Write2 - = BuildMI(*MBB, Paired, DL, Write2Desc) - .addOperand(*Addr) // addr - .addOperand(*Data0) // data0 - .addOperand(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - moveInstsAfter(Write2, InstsToMove); + moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(I); - I->eraseFromParent(); - Paired->eraseFromParent(); + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); return Next; @@ -472,27 +497,24 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } - SmallVector<MachineInstr*, 8> InstsToMove; + CombineInfo CI; + CI.I = I; unsigned Opc = MI.getOpcode(); if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { - unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, - InstsToMove); - if (Match != E) { + CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + if (findMatchingDSInst(CI)) { Modified = true; - I = mergeRead2Pair(I, Match, Size, InstsToMove); + I = mergeRead2Pair(CI); } else { ++I; } continue; } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { - unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size, - InstsToMove); - if (Match != E) { + CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 
8 : 4; + if (findMatchingDSInst(CI)) { Modified = true; - I = mergeWrite2Pair(I, Match, Size, InstsToMove); + I = mergeWrite2Pair(CI); } else { ++I; } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 7ed18f27e591..35d3a93d8710 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -51,13 +51,23 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <iterator> using namespace llvm; @@ -67,10 +77,10 @@ namespace { class SILowerControlFlow : public MachineFunctionPass { private: - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - LiveIntervals *LIS; - MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + LiveIntervals *LIS = nullptr; + MachineRegisterInfo *MRI = nullptr; void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); @@ -88,12 +98,7 @@ private: public: static char ID; - SILowerControlFlow() : - MachineFunctionPass(ID), - TRI(nullptr), - TII(nullptr), - LIS(nullptr), - MRI(nullptr) {} + SILowerControlFlow() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -113,7 +118,7 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace char SILowerControlFlow::ID = 0; @@ -175,9 +180,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // Insert a pseudo terminator to help keep the verifier happy. This will also // be used later when inserting skips. - MachineInstr *NewBr = - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) - .addOperand(MI.getOperand(2)); + MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .add(MI.getOperand(2)); if (!LIS) { MI.eraseFromParent(); @@ -220,8 +224,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { // tied. In order to correctly tie the registers, split this into a copy of // the src like it does. unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); - BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) - .addOperand(MI.getOperand(1)); // Saved EXEC + MachineInstr *CopyExec = + BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) + .add(MI.getOperand(1)); // Saved EXEC // This must be inserted before phis and any spill code inserted before the // else. 
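A recurring mechanical change in the SILoadStoreOptimizer and SILowerControlFlow hunks is the MachineInstrBuilder migration from addOperand(MO) to add(MO), which forwards an existing operand together with its register flags and subregister index. A minimal sketch of the new spelling follows; the helper and its arguments are hypothetical and exist only to isolate the API change.

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/IR/DebugLoc.h"
    #include "llvm/MC/MCInstrDesc.h"

    using namespace llvm;

    // Hypothetical helper, not from the patch: re-emit the first source operand
    // of MI as the single operand of a new instruction described by Desc.
    static MachineInstr *forwardFirstOperand(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator InsertPt,
                                             const DebugLoc &DL,
                                             const MCInstrDesc &Desc,
                                             unsigned DstReg, MachineInstr &MI) {
      return BuildMI(MBB, InsertPt, DL, Desc, DstReg)
          .add(MI.getOperand(1)) // was .addOperand(MI.getOperand(1)); add() keeps
                                 // the operand's flags and subregister index.
          .getInstr();
    }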
@@ -262,6 +267,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->RemoveMachineInstrFromMaps(MI); MI.eraseFromParent(); + LIS->InsertMachineInstrInMaps(*CopyExec); LIS->InsertMachineInstrInMaps(*OrSaveExec); LIS->InsertMachineInstrInMaps(*Xor); @@ -283,10 +289,9 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - MachineInstr *Or = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(1)); + MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(1)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *Or); @@ -306,13 +311,13 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *AndN2 = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(0)); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); MachineInstr *Branch = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .add(MI.getOperand(1)); if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *AndN2); @@ -328,9 +333,9 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock::iterator InsPt = MBB.begin(); MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addOperand(MI.getOperand(0)); + BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index be2e14fd4623..3680e02da576 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -114,18 +114,18 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { assert(Val == 0 || Val == -1); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .addOperand(Dst) - .addImm(Val); + .add(Dst) + .addImm(Val); MI.eraseFromParent(); continue; } } BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(Dst) - .addImm(0) - .addImm(-1) - .addOperand(Src); + .add(Dst) + .addImm(0) + .addImm(-1) + .add(Src); MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { @@ -140,14 +140,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MRI.getRegClass(DefInst->getOperand(3).getReg()), &AMDGPU::SGPR_64RegClass)) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) - .addOperand(Dst) - .addReg(AMDGPU::EXEC) - .addOperand(DefInst->getOperand(3)); + .add(Dst) + .addReg(AMDGPU::EXEC) + .add(DefInst->getOperand(3)); } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) - .addOperand(Dst) - .addOperand(Src) - .addImm(0); + .add(Dst) + .add(Src) + .addImm(0); } MI.eraseFromParent(); } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index ecd46b95ca6f..8e612d2ddfda 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -20,12 +20,6 @@ using namespace llvm; -static cl::opt<bool> EnableSpillSGPRToVGPR( - "amdgpu-spill-sgpr-to-vgpr", - cl::desc("Enable spilling 
VGPRs to SGPRs"), - cl::ReallyHidden, - cl::init(true)); - SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), @@ -47,13 +41,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), + PSInputEnable(0), ReturnsVoid(true), FlatWorkGroupSizes(0, 0), WavesPerEU(0, 0), DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), LDSWaveSpillSize(0), - PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), @@ -81,34 +75,48 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PrivateMemoryInputPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); + FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); + WavesPerEU = ST.getWavesPerEU(*F); - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + // Non-entry functions have no special inputs for now. + // TODO: Return early for non-entry CCs. - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + CallingConv::ID CC = F->getCallingConv(); + if (CC == CallingConv::AMDGPU_PS) + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); - if (!AMDGPU::isShader(F->getCallingConv())) { + if (AMDGPU::isKernel(CC)) { KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; } - if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue()) + if (ST.debuggerEmitPrologue()) { + // Enable everything. WorkGroupIDY = true; - - if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue()) WorkGroupIDZ = true; - - if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue()) WorkItemIDY = true; - - if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue()) WorkItemIDZ = true; + } else { + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + } // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. if (WorkItemIDZ) WorkItemIDY = true; + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo.hasStackObjects(); @@ -135,12 +143,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // We don't need to worry about accessing spills with flat instructions. // TODO: On VI where we must use flat for global, we should be able to omit // this if it is never used for generic access. 
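A note on the constructor rework above: the X flags now come from AMDGPU::isKernel(CC), PSInputAddr is only read for the AMDGPU_PS calling convention, and the per-axis Y/Z flags come either from the debugger-prologue switch or from the amdgpu-work-* function attributes (typically placed by the annotate-kernel-features pass). The FlatScratchInit guard now keys off ST.hasFlatAddressSpace(), which covers the same targets as the old Sea Islands generation check. A condensed, non-verbatim form of the flag setup, assuming the field and helper names from this hunk:

    if (ST.debuggerEmitPrologue()) {
      // The debugger prologue needs every workgroup/workitem ID.
      WorkGroupIDY = WorkGroupIDZ = true;
      WorkItemIDY = WorkItemIDZ = true;
    } else {
      WorkGroupIDY = F->hasFnAttribute("amdgpu-work-group-id-y");
      WorkGroupIDZ = F->hasFnAttribute("amdgpu-work-group-id-z");
      WorkItemIDY  = F->hasFnAttribute("amdgpu-work-item-id-y");
      WorkItemIDZ  = F->hasFnAttribute("amdgpu-work-item-id-z");
    }
    // X, XY and XYZ are the only supported combinations, so Z implies Y.
    if (WorkItemIDZ)
      WorkItemIDY = true;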
- if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS && - ST.isAmdHsaOS()) + if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) FlatScratchInit = true; - - FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); - WavesPerEU = ST.getWavesPerEU(*F); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -193,45 +197,60 @@ unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { return PrivateMemoryPtrUserSGPR; } -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( - MachineFunction *MF, - unsigned FrameIndex, - unsigned SubIdx) { - if (!EnableSpillSGPRToVGPR) - return SpilledReg(); - - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo.getObjectOffset(FrameIndex); - Offset += SubIdx * 4; - - unsigned LaneVGPRIdx = Offset / (64 * 4); - unsigned Lane = (Offset / 4) % 64; - - struct SpilledReg Spill; - Spill.Lane = Lane; - - if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, - *MF); +/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, + int FI) { + std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI]; - if (LaneVGPR == AMDGPU::NoRegister) - // We have no VGPRs left for spilling SGPRs. - return Spill; + // This has already been allocated. + if (!SpillLanes.empty()) + return true; - LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); - BI != BE; ++BI) { - BI->addLiveIn(LaneVGPR); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned WaveSize = ST.getWavefrontSize(); + + unsigned Size = FrameInfo.getObjectSize(FI); + assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size"); + assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); + + int NumLanes = Size / 4; + + // Make sure to handle the case where a wide SGPR spill may span between two + // VGPRs. + for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { + unsigned LaneVGPR; + unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); + + if (VGPRIndex == 0) { + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we won't + // partially spill the SGPR to VGPRs. + SGPRToVGPRSpills.erase(FI); + NumVGPRSpillLanes -= I; + return false; + } + + SpillVGPRs.push_back(LaneVGPR); + + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. 
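allocateSGPRSpillToVGPR hands out one 32-bit lane per spilled SGPR, a wavefront's worth of lanes per VGPR, and a wide SGPR tuple may straddle two VGPRs once the running counter wraps past the wave size. The program below is a standalone model of that bookkeeping, not pass code; the spill sizes are made up for illustration, and with a 64-lane wavefront the wrap only occurs after 64 lanes have been handed out.

    #include <cstdio>

    int main() {
      const unsigned WaveSize = 64;    // lanes per VGPR on SI/VI
      unsigned NumVGPRSpillLanes = 0;  // running counter, as in the pass
      unsigned NumSpillVGPRs = 0;

      const unsigned SpillSizesInBytes[] = {4, 32, 64}; // 1, 8 and 16 lanes
      for (unsigned Size : SpillSizesInBytes) {
        unsigned NumLanes = Size / 4;
        for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
          unsigned VGPRIndex = NumVGPRSpillLanes % WaveSize;
          if (VGPRIndex == 0)
            ++NumSpillVGPRs;           // findUnusedRegister() in the real code
          std::printf("spill lane %u -> VGPR #%u, lane index %u\n",
                      NumVGPRSpillLanes, NumSpillVGPRs - 1, VGPRIndex);
        }
      }
      return 0;
    }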
+ for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); + } else { + LaneVGPR = SpillVGPRs.back(); } + + SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); } - Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - return Spill; + return true; +} + +void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { + for (auto &R : SGPRToVGPRSpills) + MFI.RemoveStackObject(R.first); } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 6fc8d18bceba..a84f3e274f82 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,13 +16,17 @@ #include "AMDGPUMachineFunction.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" #include <array> +#include <cassert> #include <map> +#include <utility> namespace llvm { -class MachineRegisterInfo; - class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: explicit AMDGPUImagePseudoSourceValue() : @@ -109,6 +113,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // Graphics info. unsigned PSInputAddr; + unsigned PSInputEnable; + bool ReturnsVoid; // A pair of default/requested minimum/maximum flat work group sizes. @@ -130,8 +136,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { public: // FIXME: Make private unsigned LDSWaveSpillSize; - unsigned PSInputEna; - std::map<unsigned, unsigned> LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; unsigned NumSystemSGPRs; @@ -182,19 +186,39 @@ private: public: struct SpilledReg { - unsigned VGPR; - int Lane; + unsigned VGPR = AMDGPU::NoRegister; + int Lane = -1; + + SpilledReg() = default; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { } + bool hasLane() { return Lane != -1;} bool hasReg() { return VGPR != AMDGPU::NoRegister;} }; - // SIMachineFunctionInfo definition +private: + // SGPR->VGPR spilling support. + typedef std::pair<unsigned, unsigned> SpillRegMask; + + // Track VGPR + wave index for each subregister of the SGPR spilled to + // frameindex key. + DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; + unsigned NumVGPRSpillLanes = 0; + SmallVector<unsigned, 2> SpillVGPRs; + +public: SIMachineFunctionInfo(const MachineFunction &MF); - SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, - unsigned SubIdx); + + ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { + auto I = SGPRToVGPRSpills.find(FrameIndex); + return (I == SGPRToVGPRSpills.end()) ? 
+ ArrayRef<SpilledReg>() : makeArrayRef(I->second); + } + + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } @@ -399,6 +423,10 @@ public: return PSInputAddr; } + unsigned getPSInputEnable() const { + return PSInputEnable; + } + bool isPSInputAllocated(unsigned Index) const { return PSInputAddr & (1 << Index); } @@ -407,6 +435,10 @@ public: PSInputAddr |= 1 << Index; } + void markPSInputEnabled(unsigned Index) { + PSInputEnable |= 1 << Index; + } + bool returnsVoid() const { return ReturnsVoid; } @@ -512,6 +544,6 @@ public: } }; -} // End namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index da86bbf9dd2a..9d4e677400e6 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { Preds.push_back(Pred); assert(none_of(Succs, - [=](SIScheduleBlock *S) { return PredID == S->getID(); }) && + [=](std::pair<SIScheduleBlock*, + SIScheduleBlockLinkKind> S) { + return PredID == S.first->getID(); + }) && "Loop in the Block Graph!"); } -void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, + SIScheduleBlockLinkKind Kind) { unsigned SuccID = Succ->getID(); // Check if not already predecessor. - for (SIScheduleBlock* S : Succs) { - if (SuccID == S->getID()) + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { + if (SuccID == S.first->getID()) { + if (S.second == SIScheduleBlockLinkKind::NoData && + Kind == SIScheduleBlockLinkKind::Data) + S.second = Kind; return; + } } if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; - Succs.push_back(Succ); + Succs.push_back(std::make_pair(Succ, Kind)); + assert(none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"); @@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) { } dbgs() << "\nSuccessors:\n"; - for (SIScheduleBlock* S : Succs) { - S->printDebug(false); + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { + if (S.second == SIScheduleBlockLinkKind::Data) + dbgs() << "(Data Dep) "; + S.first->printDebug(false); } if (Scheduled) { @@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() { } } +static bool +hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { + for (const auto &PredDep : SU.Preds) { + if (PredDep.getSUnit() == &FromSU && + PredDep.getKind() == llvm::SDep::Data) + return true; + } + return false; +} + void SIScheduleBlockCreator::colorHighLatenciesGroups() { unsigned DAGSize = DAG->SUnits.size(); unsigned NumHighLatencies = 0; unsigned GroupSize; - unsigned Color = NextReservedID; + int Color = NextReservedID; unsigned Count = 0; std::set<unsigned> FormingGroup; @@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { else GroupSize = 4; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - if (DAG->IsHighLatencySU[SU->NodeNum]) { + for (unsigned SUNum : DAG->TopDownIndex2SU) { + const SUnit &SU = DAG->SUnits[SUNum]; + if (DAG->IsHighLatencySU[SU.NodeNum]) { unsigned CompatibleGroup = true; - unsigned 
ProposedColor = Color; + int ProposedColor = Color; + std::vector<int> AdditionalElements; + + // We don't want to put in the same block + // two high latency instructions that depend + // on each other. + // One way would be to check canAddEdge + // in both directions, but that currently is not + // enough because there the high latency order is + // enforced (via links). + // Instead, look at the dependencies between the + // high latency instructions and deduce if it is + // a data dependency or not. for (unsigned j : FormingGroup) { - // TODO: Currently CompatibleGroup will always be false, - // because the graph enforces the load order. This - // can be fixed, but as keeping the load order is often - // good for performance that causes a performance hit (both - // the default scheduler and this scheduler). - // When this scheduler determines a good load order, - // this can be fixed. - if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || - !DAG->canAddEdge(&DAG->SUnits[j], SU)) + bool HasSubGraph; + std::vector<int> SubGraph; + // By construction (topological order), if SU and + // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // in the parent graph of SU. +#ifndef NDEBUG + SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], + HasSubGraph); + assert(!HasSubGraph); +#endif + SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, + HasSubGraph); + if (!HasSubGraph) + continue; // No dependencies between each other + else if (SubGraph.size() > 5) { + // Too many elements would be required to be added to the block. CompatibleGroup = false; + break; + } + else { + // Check the type of dependency + for (unsigned k : SubGraph) { + // If in the path to join the two instructions, + // there is another high latency instruction, + // or instructions colored for another block + // abort the merge. + if (DAG->IsHighLatencySU[k] || + (CurrentColoring[k] != ProposedColor && + CurrentColoring[k] != 0)) { + CompatibleGroup = false; + break; + } + // If one of the SU in the subgraph depends on the result of SU j, + // there'll be a data dependency. + if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + } + if (!CompatibleGroup) + break; + // Same check for the SU + if (hasDataDependencyPred(SU, DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + // Add all the required instructions to the block + // These cannot live in another block (because they + // depend (order dependency) on one of the + // instruction in the block, and are required for the + // high latency instruction we add. + AdditionalElements.insert(AdditionalElements.end(), + SubGraph.begin(), SubGraph.end()); + } } - if (!CompatibleGroup || ++Count == GroupSize) { + if (CompatibleGroup) { + FormingGroup.insert(SU.NodeNum); + for (unsigned j : AdditionalElements) + CurrentColoring[j] = ProposedColor; + CurrentColoring[SU.NodeNum] = ProposedColor; + ++Count; + } + // Found one incompatible instruction, + // or has filled a big enough group. + // -> start a new one. 
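The comment block above describes the new grouping rule for high-latency instructions. As a readability aid, here is a condensed, non-verbatim restatement of the compatibility test, using the names from this hunk (SU, j, ProposedColor, CurrentColoring, hasDataDependencyPred); the real loop also breaks out early and collects AdditionalElements so the order-only nodes on the path are pulled into the same block:

    bool HasSubGraph;
    std::vector<int> SubGraph =
        DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, HasSubGraph);
    bool Compatible = true;
    if (HasSubGraph) {
      if (SubGraph.size() > 5)
        Compatible = false; // too many extra nodes would have to join the block
      for (unsigned K : SubGraph) {
        if (DAG->IsHighLatencySU[K] ||
            (CurrentColoring[K] != ProposedColor && CurrentColoring[K] != 0) ||
            hasDataDependencyPred(DAG->SUnits[K], DAG->SUnits[j]))
          Compatible = false; // another high-latency SU, a foreign color, or a
                              // true data dependency on SUnits[j]
      }
      if (hasDataDependencyPred(SU, DAG->SUnits[j]))
        Compatible = false;   // SU itself consumes the result of SUnits[j]
    }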
+ if (!CompatibleGroup) { FormingGroup.clear(); Color = ++NextReservedID; - if (!CompatibleGroup) { - ProposedColor = Color; - FormingGroup.insert(SU->NodeNum); - } + ProposedColor = Color; + FormingGroup.insert(SU.NodeNum); + CurrentColoring[SU.NodeNum] = ProposedColor; + Count = 0; + } else if (Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + ProposedColor = Color; Count = 0; - } else { - FormingGroup.insert(SU->NodeNum); } - CurrentColoring[SU->NodeNum] = ProposedColor; } } } @@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { unsigned DAGSize = DAG->SUnits.size(); std::vector<int> PendingColoring = CurrentColoring; + assert(DAGSize >= 1 && + CurrentBottomUpReservedDependencyColoring.size() == DAGSize && + CurrentTopDownReservedDependencyColoring.size() == DAGSize); + // If there is no reserved block at all, do nothing. We don't want + // everything in one block. + if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), + CurrentBottomUpReservedDependencyColoring.end()) == 0 && + *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), + CurrentTopDownReservedDependencyColoring.end()) == 0) + return; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; @@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { SUColors.insert(CurrentColoring[Succ->NodeNum]); SUColorsPending.insert(PendingColoring[Succ->NodeNum]); } + // If there is only one child/parent block, and that block + // is not among the ones we are removing in this path, then + // merge the instruction to that block if (SUColors.size() == 1 && SUColorsPending.size() == 1) PendingColoring[SU->NodeNum] = *SUColors.begin(); else // TODO: Attribute new colors depending on color @@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; - std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); - if (Pos != ColorCount.end()) { - ++ColorCount[color]; - } else { - ColorCount[color] = 1; - } + ++ColorCount[color]; } for (unsigned SUNum : DAG->BottomUpIndex2SU) { @@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) continue; if (Node2CurrentBlock[Succ->NodeNum] != SUID) - CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], + SuccDep.isCtrl() ? 
NoData : Data); } for (SDep& PredDep : SU->Preds) { SUnit *Pred = PredDep.getSUnit(); @@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() { Block->Height = 0; else { unsigned Height = 0; - for (SIScheduleBlock *Succ : Block->getSuccs()) { - if (Height < Succ->Height + 1) - Height = Succ->Height + 1; - } + for (const auto &Succ : Block->getSuccs()) + Height = std::min(Height, Succ.first->Height + 1); Block->Height = Height; } } @@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, continue; int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; - std::map<unsigned, unsigned>::iterator RegPos = - LiveOutRegsNumUsages[PredID].find(Reg); - if (RegPos != LiveOutRegsNumUsages[PredID].end()) { - ++LiveOutRegsNumUsages[PredID][Reg]; - } else { - LiveOutRegsNumUsages[PredID][Reg] = 1; - } + ++LiveOutRegsNumUsages[PredID][Reg]; } } @@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, std::set<unsigned> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); + // Increase LiveOutRegsNumUsages for blocks + // producing registers consumed in another + // scheduling region. + for (unsigned Reg : DAG->getOutRegs()) { + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + // Do reverse traversal + int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; + SIScheduleBlock *Block = Blocks[ID]; + const std::set<unsigned> &OutRegs = Block->getOutRegs(); + + if (OutRegs.find(Reg) == OutRegs.end()) + continue; + + ++LiveOutRegsNumUsages[ID][Reg]; + break; + } + } + // Fill LiveRegsConsumers for regs that were already // defined before scheduling. for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { @@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } } - if (!Found) { - if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) - LiveRegsConsumers[Reg] = 1; - else - ++LiveRegsConsumers[Reg]; - } + if (!Found) + ++LiveRegsConsumers[Reg]; } } @@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, for (SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } + dbgs() << '\n'; ); } @@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { VregCurrentUsage, SregCurrentUsage); if (VregCurrentUsage > maxVregUsage) maxVregUsage = VregCurrentUsage; - if (VregCurrentUsage > maxSregUsage) - maxSregUsage = VregCurrentUsage; + if (SregCurrentUsage > maxSregUsage) + maxSregUsage = SregCurrentUsage; DEBUG( dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; @@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, } void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { - for (SIScheduleBlock* Block : Parent->getSuccs()) { - --BlockNumPredsLeft[Block->getID()]; - if (BlockNumPredsLeft[Block->getID()] == 0) { - ReadyBlocks.push_back(Block); - } - // TODO: Improve check. When the dependency between the high latency - // instructions and the instructions of the other blocks are WAR or WAW - // there will be no wait triggered. We would like these cases to not - // update LastPosHighLatencyParentScheduled. 
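The removed TODO above is what the SIScheduleBlockLinkKind split addresses: an order-only (WAR/WAW) edge out of a high-latency block does not force a wait on its result, so only Data successors should move LastPosHighLatencyParentScheduled. The replacement loop appears in the next hunk; condensed, it reads:

    for (const auto &Succ : Parent->getSuccs()) {
      if (--BlockNumPredsLeft[Succ.first->getID()] == 0)
        ReadyBlocks.push_back(Succ.first);
      // Only a data edge from a high-latency parent matters for the wait
      // heuristic.
      if (Parent->isHighLatencyBlock() &&
          Succ.second == SIScheduleBlockLinkKind::Data)
        LastPosHighLatencyParentScheduled[Succ.first->getID()] =
            NumBlockScheduled;
    }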
- if (Parent->isHighLatencyBlock()) - LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + for (const auto &Block : Parent->getSuccs()) { + if (--BlockNumPredsLeft[Block.first->getID()] == 0) + ReadyBlocks.push_back(Block.first); + + if (Parent->isHighLatencyBlock() && + Block.second == SIScheduleBlockLinkKind::Data) + LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; } } @@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { LiveOutRegsNumUsages[Block->getID()].begin(), E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { std::pair<unsigned, unsigned> RegP = *RegI; - if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) - LiveRegsConsumers[RegP.first] = RegP.second; - else { - assert(LiveRegsConsumers[RegP.first] == 0); - LiveRegsConsumers[RegP.first] += RegP.second; - } + // We produce this register, thus it must not be previously alive. + assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || + LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; } if (LastPosHighLatencyParentScheduled[Block->getID()] > (unsigned)LastPosWaitedHighLatency) @@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is extremely high, try other good performing variants // which could lead to lower VGPR usage if (Best.MaxVGPRUsage > 180) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { { LatenciesAlone, BlockRegUsageLatency }, // { LatenciesAlone, BlockRegUsage }, { LatenciesGrouped, BlockLatencyRegUsage }, @@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is still extremely high, we may spill. Try other variants // which are less performing, but that could lead to lower VGPR usage. if (Best.MaxVGPRUsage > 200) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { // { LatenciesAlone, BlockRegUsageLatency }, { LatenciesAlone, BlockRegUsage }, // { LatenciesGrouped, BlockLatencyRegUsage }, diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index 77c07350d325..122d0f67ca8c 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -40,13 +40,12 @@ enum SIScheduleCandReason { struct SISchedulerCandidate { // The reason for this candidate. - SIScheduleCandReason Reason; + SIScheduleCandReason Reason = NoCand; // Set of reasons that apply to multiple candidates. 
- uint32_t RepeatReasonSet; + uint32_t RepeatReasonSet = 0; - SISchedulerCandidate() - : Reason(NoCand), RepeatReasonSet(0) {} + SISchedulerCandidate() = default; bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } @@ -55,6 +54,11 @@ struct SISchedulerCandidate { class SIScheduleDAGMI; class SIScheduleBlockCreator; +enum SIScheduleBlockLinkKind { + NoData, + Data +}; + class SIScheduleBlock { SIScheduleDAGMI *DAG; SIScheduleBlockCreator *BC; @@ -84,8 +88,8 @@ class SIScheduleBlock { std::set<unsigned> LiveInRegs; std::set<unsigned> LiveOutRegs; - bool Scheduled; - bool HighLatencyBlock; + bool Scheduled = false; + bool HighLatencyBlock = false; std::vector<unsigned> HasLowLatencyNonWaitedParent; @@ -93,14 +97,14 @@ class SIScheduleBlock { unsigned ID; std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. - std::vector<SIScheduleBlock*> Succs; // All blocks successors. - unsigned NumHighLatencySuccessors; + // All blocks successors, and the kind of link + std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs; + unsigned NumHighLatencySuccessors = 0; public: SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, unsigned ID): - DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false), - HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {} + DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {} ~SIScheduleBlock() = default; @@ -114,10 +118,11 @@ public: // Add block pred, which has instruction predecessor of SU. void addPred(SIScheduleBlock *Pred); - void addSucc(SIScheduleBlock *Succ); + void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind); const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } - const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; } + ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> + getSuccs() const { return Succs; } unsigned Height; // Maximum topdown path length to block without outputs unsigned Depth; // Maximum bottomup path length to block without inputs @@ -213,9 +218,9 @@ struct SIScheduleBlocks { }; enum SISchedulerBlockCreatorVariant { - LatenciesAlone, - LatenciesGrouped, - LatenciesAlonePlusConsecutive + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive }; class SIScheduleBlockCreator { @@ -451,6 +456,7 @@ public: LiveIntervals *getLIS() { return LIS; } MachineRegisterInfo *getMRI() { return &MRI; } const TargetRegisterInfo *getTRI() { return TRI; } + ScheduleDAGTopologicalSort *GetTopo() { return &Topo; } SUnit& getEntrySU() { return EntrySU; } SUnit& getExitSU() { return ExitSU; } @@ -469,6 +475,14 @@ public: return InRegs; } + std::set<unsigned> getOutRegs() { + std::set<unsigned> OutRegs; + for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { + OutRegs.insert(RegMaskPair.RegUnit); + } + return OutRegs; + }; + unsigned getVGPRSetID() const { return VGPRSetID; } unsigned getSGPRSetID() const { return SGPRSetID; } diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp new file mode 100644 index 000000000000..e02c2e3240e8 --- /dev/null +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -0,0 +1,713 @@ +//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file This pass tries to apply several peephole SDWA patterns. +/// +/// E.g. original: +/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1 +/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3 +/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2 +/// +/// Replace: +/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3 +/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +/// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include <unordered_map> + +using namespace llvm; + +#define DEBUG_TYPE "si-peephole-sdwa" + +STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found."); +STATISTIC(NumSDWAInstructionsPeepholed, + "Number of instruction converted to SDWA."); + +namespace { + +class SDWAOperand; + +class SIPeepholeSDWA : public MachineFunctionPass { +private: + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + + std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; + + Optional<int64_t> foldToImm(const MachineOperand &Op) const; + +public: + static char ID; + + typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector; + + SIPeepholeSDWA() : MachineFunctionPass(ID) { + initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void matchSDWAOperands(MachineFunction &MF); + bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); + + StringRef getPassName() const override { return "SI Peephole SDWA"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +class SDWAOperand { +private: + MachineOperand *Target; // Operand that would be used in converted instruction + MachineOperand *Replaced; // Operand that would be replace by Target + +public: + SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) + : Target(TargetOp), Replaced(ReplacedOp) { + assert(Target->isReg()); + assert(Replaced->isReg()); + } + + virtual ~SDWAOperand() {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; + + MachineOperand *getTargetOperand() const { return Target; } + MachineOperand *getReplacedOperand() const { return Replaced; } + MachineInstr *getParentInst() const { return Target->getParent(); } + MachineRegisterInfo *getMRI() const { + return &getParentInst()->getParent()->getParent()->getRegInfo(); + } +}; + +using namespace AMDGPU::SDWA; + +class SDWASrcOperand : public SDWAOperand { +private: + SdwaSel SrcSel; + bool Abs; + bool Neg; + bool Sext; + +public: + SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, + SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, + bool Sext_ = false) + : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), + Neg(Neg_), Sext(Sext_) {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + + SdwaSel getSrcSel() const { return SrcSel; } + bool getAbs() const { return Abs; } + bool getNeg() const { 
return Neg; } + bool getSext() const { return Sext; } + + uint64_t getSrcMods() const; +}; + +class SDWADstOperand : public SDWAOperand { +private: + SdwaSel DstSel; + DstUnused DstUn; + +public: + SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, + SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) + : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} + + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + + SdwaSel getDstSel() const { return DstSel; } + DstUnused getDstUnused() const { return DstUn; } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) + +char SIPeepholeSDWA::ID = 0; + +char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; + +FunctionPass *llvm::createSIPeepholeSDWAPass() { + return new SIPeepholeSDWA(); +} + +#ifndef NDEBUG + +static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { + switch(Sel) { + case BYTE_0: OS << "BYTE_0"; break; + case BYTE_1: OS << "BYTE_1"; break; + case BYTE_2: OS << "BYTE_2"; break; + case BYTE_3: OS << "BYTE_3"; break; + case WORD_0: OS << "WORD_0"; break; + case WORD_1: OS << "WORD_1"; break; + case DWORD: OS << "DWORD"; break; + } + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { + switch(Un) { + case UNUSED_PAD: OS << "UNUSED_PAD"; break; + case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; + case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; + } + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) { + OS << "SDWA src: " << *Src.getTargetOperand() + << " src_sel:" << Src.getSrcSel() + << " abs:" << Src.getAbs() << " neg:" << Src.getNeg() + << " sext:" << Src.getSext() << '\n'; + return OS; +} + +static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) { + OS << "SDWA dst: " << *Dst.getTargetOperand() + << " dst_sel:" << Dst.getDstSel() + << " dst_unused:" << Dst.getDstUnused() << '\n'; + return OS; +} + +#endif + +static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { + assert(To.isReg() && From.isReg()); + To.setReg(From.getReg()); + To.setSubReg(From.getSubReg()); + To.setIsUndef(From.isUndef()); + if (To.isUse()) { + To.setIsKill(From.isKill()); + } else { + To.setIsDead(From.isDead()); + } +} + +static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { + return LHS.isReg() && + RHS.isReg() && + LHS.getReg() == RHS.getReg() && + LHS.getSubReg() == RHS.getSubReg(); +} + +static bool isSubregOf(const MachineOperand &SubReg, + const MachineOperand &SuperReg, + const TargetRegisterInfo *TRI) { + + if (!SuperReg.isReg() || !SubReg.isReg()) + return false; + + if (isSameReg(SuperReg, SubReg)) + return true; + + if (SuperReg.getReg() != SubReg.getReg()) + return false; + + LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()); + LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg()); + SuperMask |= ~SubMask; + return SuperMask.all(); +} + +uint64_t SDWASrcOperand::getSrcMods() const { + uint64_t Mods = 0; + if (Abs || Neg) { + assert(!Sext && + "Float and integer src modifiers can't be set simulteniously"); + Mods |= Abs ? SISrcMods::ABS : 0; + Mods |= Neg ? 
SISrcMods::NEG : 0; + } else if (Sext) { + Mods |= SISrcMods::SEXT; + } + + return Mods; +} + +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { + // For SDWA src operand potential instruction is one that use register + // defined by parent instruction + MachineRegisterInfo *MRI = getMRI(); + MachineOperand *Replaced = getReplacedOperand(); + assert(Replaced->isReg()); + + MachineInstr *PotentialMI = nullptr; + for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) { + // If this is use of another subreg of dst reg then do nothing + if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) + continue; + + // If there exist use of superreg of dst then we should not combine this + // opernad + if (!isSameReg(PotentialMO, *Replaced)) + return nullptr; + + // Check that PotentialMI is only instruction that uses dst reg + if (PotentialMI == nullptr) { + PotentialMI = PotentialMO.getParent(); + } else if (PotentialMI != PotentialMO.getParent()) { + return nullptr; + } + } + + return PotentialMI; +} + +bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { + // Find operand in instruction that matches source operand and replace it with + // target operand. Set corresponding src_sel + + MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); + MachineOperand *SrcMods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); + assert(Src && Src->isReg()); + if (!isSameReg(*Src, *getReplacedOperand())) { + // If this is not src0 then it should be src1 + Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); + SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + assert(Src && Src->isReg()); + + if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && + !isSameReg(*Src, *getReplacedOperand())) { + // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to + // src2. This is not allowed. + return false; + } + + assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods); + } + copyRegOperand(*Src, *getTargetOperand()); + SrcSel->setImm(getSrcSel()); + SrcMods->setImm(getSrcMods()); + getTargetOperand()->setIsKill(false); + return true; +} + +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { + // For SDWA dst operand potential instruction is one that defines register + // that this operand uses + MachineRegisterInfo *MRI = getMRI(); + MachineInstr *ParentMI = getParentInst(); + MachineOperand *Replaced = getReplacedOperand(); + assert(Replaced->isReg()); + + for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) { + if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) + continue; + + if (!isSameReg(*Replaced, PotentialMO)) + return nullptr; + + // Check that ParentMI is the only instruction that uses replaced register + for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) { + if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) && + UseMO.getParent() != ParentMI) { + return nullptr; + } + } + + // Due to SSA this should be onle def of replaced register, so return it + return PotentialMO.getParent(); + } + + return nullptr; +} + +bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { + // Replace vdst operand in MI with target operand. 
Set dst_sel and dst_unused + + if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && + getDstSel() != AMDGPU::SDWA::DWORD) { + // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD + return false; + } + + MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + assert(Operand && + Operand->isReg() && + isSameReg(*Operand, *getReplacedOperand())); + copyRegOperand(*Operand, *getTargetOperand()); + MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); + assert(DstSel); + DstSel->setImm(getDstSel()); + MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + assert(DstUnused); + DstUnused->setImm(getDstUnused()); + + // Remove original instruction because it would conflict with our new + // instruction by register definition + getParentInst()->eraseFromParent(); + return true; +} + +Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { + if (Op.isImm()) { + return Op.getImm(); + } + + // If this is not immediate then it can be copy of immediate value, e.g.: + // %vreg1<def> = S_MOV_B32 255; + if (Op.isReg()) { + for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { + if (!isSameReg(Op, Def)) + continue; + + const MachineInstr *DefInst = Def.getParent(); + if (!TII->isFoldableCopy(*DefInst)) + return None; + + const MachineOperand &Copied = DefInst->getOperand(1); + if (!Copied.isImm()) + return None; + + return Copied.getImm(); + } + } + + return None; +} + +void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_LSHLREV_B32_e32: { + // from: v_lshrrev_b32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 + + // from: v_ashrrev_i32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 + + // from: v_lshlrev_b32_e32 v1, 16/24, v0 + // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm) + break; + + if (*Imm != 16 && *Imm != 24) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B32_e32) { + auto SDWADst = make_unique<SDWADstOperand>( + Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); + SDWAOperands[&MI] = std::move(SDWADst); + ++NumSDWAPatternsFound; + } else { + auto SDWASrc = make_unique<SDWASrcOperand>( + Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, + Opcode == AMDGPU::V_LSHRREV_B32_e32 ? 
false : true); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + } + break; + } + + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_LSHLREV_B16_e32: { + // from: v_lshrrev_b16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 + + // from: v_ashrrev_i16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 sext:1 + + // from: v_lshlrev_b16_e32 v1, 8, v0 + // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm || *Imm != 8) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B16_e32) { + auto SDWADst = + make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); + SDWAOperands[&MI] = std::move(SDWADst); + ++NumSDWAPatternsFound; + } else { + auto SDWASrc = make_unique<SDWASrcOperand>( + Src1, Dst, BYTE_1, false, false, + Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + } + break; + } + + case AMDGPU::V_BFE_I32: + case AMDGPU::V_BFE_U32: { + // e.g.: + // from: v_bfe_u32 v1, v0, 8, 8 + // to SDWA src:v0 src_sel:BYTE_1 + + // offset | width | src_sel + // ------------------------ + // 0 | 8 | BYTE_0 + // 0 | 16 | WORD_0 + // 0 | 32 | DWORD ? + // 8 | 8 | BYTE_1 + // 16 | 8 | BYTE_2 + // 16 | 16 | WORD_1 + // 24 | 8 | BYTE_3 + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + auto Offset = foldToImm(*Src1); + if (!Offset) + break; + + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + auto Width = foldToImm(*Src2); + if (!Width) + break; + + SdwaSel SrcSel = DWORD; + + if (*Offset == 0 && *Width == 8) + SrcSel = BYTE_0; + else if (*Offset == 0 && *Width == 16) + SrcSel = WORD_0; + else if (*Offset == 0 && *Width == 32) + SrcSel = DWORD; + else if (*Offset == 8 && *Width == 8) + SrcSel = BYTE_1; + else if (*Offset == 16 && *Width == 8) + SrcSel = BYTE_2; + else if (*Offset == 16 && *Width == 16) + SrcSel = WORD_1; + else if (*Offset == 24 && *Width == 8) + SrcSel = BYTE_3; + else + break; + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src0->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + auto SDWASrc = make_unique<SDWASrcOperand>( + Src0, Dst, SrcSel, false, false, + Opcode == AMDGPU::V_BFE_U32 ? 
false : true); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + break; + } + case AMDGPU::V_AND_B32_e32: { + // e.g.: + // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 + // to SDWA src:v0 src_sel:WORD_0/BYTE_0 + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm) + break; + + if (*Imm != 0x0000ffff && *Imm != 0x000000ff) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + auto SDWASrc = make_unique<SDWASrcOperand>( + Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); + DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); + SDWAOperands[&MI] = std::move(SDWASrc); + ++NumSDWAPatternsFound; + break; + } + } + } + } +} + +bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, + const SDWAOperandsVector &SDWAOperands) { + // Check if this instruction can be converted to SDWA: + // 1. Does this opcode support SDWA + if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1) + return false; + + // 2. Are all operands - VGPRs + for (const MachineOperand &Operand : MI.explicit_operands()) { + if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg())) + return false; + } + + // Convert to sdwa + int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode()); + assert(SDWAOpcode != -1); + + const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); + + // Create SDWA version of instruction MI and initialize its operands + MachineInstrBuilder SDWAInst = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + + // Copy dst, if it is present in original then should also be present in SDWA + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (Dst) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1); + SDWAInst.add(*Dst); + } else { + assert(TII->isVOPC(MI)); + } + + // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and + // src0_modifiers (except for v_nop_sdwa, but it can't get here) + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + assert( + Src0 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1); + SDWAInst.addImm(0); + SDWAInst.add(*Src0); + + // Copy src1 if present, initialize src1_modifiers. + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) { + assert( + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1); + SDWAInst.addImm(0); + SDWAInst.add(*Src1); + } else { + assert(TII->isVOP1(MI)); + } + + if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || + SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { + // v_mac_f16/32 has additional src2 operand tied to vdst + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + assert(Src2); + SDWAInst.add(*Src2); + } + + // Initialize clamp. 
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1); + SDWAInst.addImm(0); + + // Initialize dst_sel and dst_unused if present + if (Dst) { + assert( + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 && + AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1); + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); + } + + // Initialize src0_sel + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + + + // Initialize src1_sel if present + if (Src1) { + assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } + + // Apply all sdwa operand pattenrs + bool Converted = false; + for (auto &Operand : SDWAOperands) { + Converted |= Operand->convertToSDWA(*SDWAInst, TII); + } + if (!Converted) { + SDWAInst->eraseFromParent(); + return false; + } + + DEBUG(dbgs() << "Convert instruction:" << MI + << "Into:" << *SDWAInst << '\n'); + ++NumSDWAInstructionsPeepholed; + + MI.eraseFromParent(); + return true; +} + +bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + if (!ST.hasSDWA() || + !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + return false; + } + + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + + std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; + + matchSDWAOperands(MF); + + for (auto &OperandPair : SDWAOperands) { + auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI) { + PotentialMatches[PotentialMI].push_back(std::move(Operand)); + } + } + + for (auto &PotentialPair : PotentialMatches) { + MachineInstr &PotentialMI = *PotentialPair.first; + convertToSDWA(PotentialMI, PotentialPair.second); + } + + SDWAOperands.clear(); + return false; +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1ed5e8441df..36d4df52ff0e 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -24,12 +24,6 @@ using namespace llvm; -static cl::opt<bool> EnableSpillSGPRToSMEM( - "amdgpu-spill-sgpr-to-smem", - cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), - cl::init(false)); - - static bool hasPressureSet(const int *PSets, unsigned PSetID) { for (unsigned i = 0; PSets[i] != -1; ++i) { if (PSets[i] == (int)PSetID) @@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, } } -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(), - SGPRPressureSets(getNumRegPressureSets()), - VGPRPressureSets(getNumRegPressureSets()) { +static cl::opt<bool> EnableSpillSGPRToSMEM( + "amdgpu-spill-sgpr-to-smem", + cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"), + cl::init(false)); + +static cl::opt<bool> EnableSpillSGPRToVGPR( + "amdgpu-spill-sgpr-to-vgpr", + cl::desc("Enable spilling VGPRs to SGPRs"), + cl::ReallyHidden, + cl::init(true)); + +SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : + AMDGPURegisterInfo(), + SGPRPressureSets(getNumRegPressureSets()), + VGPRPressureSets(getNumRegPressureSets()), + SpillSGPRToVGPR(false), + SpillSGPRToSMEM(false) { + if (EnableSpillSGPRToSMEM && ST.hasScalarStores()) + SpillSGPRToSMEM = true; + else if (EnableSpillSGPRToVGPR) + SpillSGPRToVGPR = true; 
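With this change the two spill switches live in SIRegisterInfo and are resolved once at construction time: SMEM spilling is used when amdgpu-spill-sgpr-to-smem is set and the subtarget has scalar stores, otherwise VGPR lane spilling stays enabled by default via amdgpu-spill-sgpr-to-vgpr. A small sketch of how later code consults the choice (accessor names as used in spillSGPR below and in allocateSGPRSpillToVGPR above):

    bool SpillToSMEM = spillSGPRToSMEM(); // scalar stores, needs ST.hasScalarStores()
    bool SpillToVGPR = spillSGPRToVGPR(); // default v_writelane/v_readlane path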
+ unsigned NumRegPressureSets = getNumRegPressureSets(); SGPRSetID = NumRegPressureSets; @@ -97,14 +110,18 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4; + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - unsigned RegCount = getMaxNumSGPRs(MF); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned RegCount = ST.getMaxNumSGPRs(MF); unsigned Reg; // Try to place it in a hole after PrivateSegmentbufferReg. @@ -129,6 +146,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // Reserve the memory aperture registers. + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); + reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -139,14 +162,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); - unsigned MaxNumSGPRs = getMaxNumSGPRs(MF); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); reserveRegisterTuples(Reserved, Reg); } - unsigned MaxNumVGPRs = getMaxNumVGPRs(MF); + unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); @@ -253,7 +278,6 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -263,8 +287,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) .addFrameIndex(FrameIdx); - BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) - .addReg(UnusedCarry, RegState::Define | RegState::Dead) + TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); } @@ -415,14 +438,14 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg(); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(Reg, getDefRegState(!IsStore)) - .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) - .addOperand(*TII->getNamedOperand(*MI, 
AMDGPU::OpName::soffset)) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + .addReg(Reg, getDefRegState(!IsStore)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); return true; } @@ -545,11 +568,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize, AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; } -void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, +bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, - RegScavenger *RS) const { + RegScavenger *RS, + bool OnlyToVGPR) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills + = MFI->getSGPRToVGPRSpills(Index); + bool SpillToVGPR = !VGPRSpills.empty(); + if (OnlyToVGPR && !SpillToVGPR) + return false; + MachineRegisterInfo &MRI = MF->getRegInfo(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -558,10 +590,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + bool SpillToSMEM = spillSGPRToSMEM(); + if (SpillToSMEM && OnlyToVGPR) + return false; assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -634,9 +667,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, continue; } - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - if (Spill.hasReg()) { + if (SpillToVGPR) { + SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) @@ -647,6 +680,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, // frame index, we should delete the frame index when all references to // it are fixed. } else { + // XXX - Can to VGPR spill fail for some subregisters but not others? + if (OnlyToVGPR) + return false; + // Spill SGPR to a frame index. // TODO: Should VI try to spill to VGPR and then spill to SMEM? 
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -690,22 +727,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MI->eraseFromParent(); MFI->addToSpilledSGPRs(NumSubRegs); + return true; } -void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, +bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, - RegScavenger *RS) const { + RegScavenger *RS, + bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills + = MFI->getSGPRToVGPRSpills(Index); + bool SpillToVGPR = !VGPRSpills.empty(); + if (OnlyToVGPR && !SpillToVGPR) + return false; + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); unsigned SuperReg = MI->getOperand(0).getReg(); - bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM; + bool SpillToSMEM = spillSGPRToSMEM(); + if (SpillToSMEM && OnlyToVGPR) + return false; assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -773,10 +821,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, continue; } - SIMachineFunctionInfo::SpilledReg Spill - = MFI->getSpilledReg(MF, Index, i); - - if (Spill.hasReg()) { + if (SpillToVGPR) { + SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; auto MIB = BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) @@ -786,6 +832,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (NumSubRegs > 1) MIB.addReg(SuperReg, RegState::ImplicitDefine); } else { + if (OnlyToVGPR) + return false; + // Restore SGPR from a stack slot. // FIXME: We should use S_LOAD_DWORD here for VI. unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -820,6 +869,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, } MI->eraseFromParent(); + return true; +} + +/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to +/// a VGPR and the stack slot can be safely eliminated when all other users are +/// handled. 
+bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( + MachineBasicBlock::iterator MI, + int FI, + RegScavenger *RS) const { + switch (MI->getOpcode()) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: + return spillSGPR(MI, FI, RS, true); + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: + return restoreSGPR(MI, FI, RS, true); + default: + llvm_unreachable("not an SGPR spill instruction"); + } } void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, @@ -1156,210 +1231,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, return AMDGPU::NoRegister; } -unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 800; - return 512; -} - -unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 102; - return 104; -} - -unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST, - const SIMachineFunctionInfo &MFI) const { - if (MFI.hasFlatScratchInit()) { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // FLAT_SCRATCH, XNACK, VCC (in that order) - - if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) - return 4; // FLAT_SCRATCH, VCC (in that order) - } - - if (ST.isXNACKEnabled()) - return 4; // XNACK, VCC (in that order) - - return 2; // VCC. -} - -unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST, - unsigned WavesPerEU) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 0; - case 8: return 81; - default: return 97; - } - } else { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 49; - case 8: return 57; - case 7: return 65; - case 6: return 73; - case 5: return 81; - default: return 97; - } - } -} - -unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST, - unsigned WavesPerEU, - bool Addressable) const { - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WavesPerEU) { - case 0: return 80; - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return Addressable ? getNumAddressableSGPRs(ST) : 112; - } - } else { - switch (WavesPerEU) { - case 0: return 48; - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return getNumAddressableSGPRs(ST); - } - } -} - -unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - - // Compute maximum number of SGPRs function can use using default/requested - // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); - unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false); - unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true); - - // Check if maximum number of SGPRs was explicitly requested using - // "amdgpu-num-sgpr" attribute. 
- if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = AMDGPU::getIntegerAttribute( - F, "amdgpu-num-sgpr", MaxNumSGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI))) - Requested = 0; - - // If more SGPRs are required to support the input user/system SGPRs, - // increase to accommodate them. - // - // FIXME: This really ends up using the requested number of SGPRs + number - // of reserved special registers in total. Theoretically you could re-use - // the last input registers for these special registers, but this would - // require a lot of complexity to deal with the weird aliasing. - unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs(); - if (Requested && Requested < NumInputSGPRs) - Requested = NumInputSGPRs; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. - if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumSGPRs = Requested; - } - - if (ST.hasSGPRInitBug()) - MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - - return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI), - MaxNumAddressableSGPRs); -} - -unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs( - const SISubtarget &ST) const { - if (ST.debuggerReserveRegs()) - return 4; - return 0; -} - -unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 0; - case 10: return 0; - case 9: return 25; - case 8: return 29; - case 7: return 33; - case 6: return 37; - case 5: return 41; - case 4: return 49; - case 3: return 65; - case 2: return 85; - default: return 129; - } -} - -unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const { - switch (WavesPerEU) { - case 0: return 24; - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return getTotalNumVGPRs(); - } -} - -unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - - // Compute maximum number of VGPRs function can use using default/requested - // minimum number of waves per execution unit. - std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); - unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); - - // Check if maximum number of VGPRs was explicitly requested using - // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = AMDGPU::getIntegerAttribute( - F, "amdgpu-num-vgpr", MaxNumVGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST)) - Requested = 0; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit. 
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumVGPRs = Requested; - } - - return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST); -} - ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { if (EltSize == 4) { @@ -1476,3 +1347,62 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const { return hasVGPRs(getRegClassForReg(MRI, Reg)); } + +bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const { + unsigned SrcSize = SrcRC->getSize(); + unsigned DstSize = DstRC->getSize(); + unsigned NewSize = NewRC->getSize(); + + // Do not increase size of registers beyond dword, we would need to allocate + // adjacent registers and constraint regalloc more than needed. + + // Always allow dword coalescing. + if (SrcSize <= 4 || DstSize <= 4) + return true; + + return NewSize <= DstSize || NewSize <= SrcSize; +} + +unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), + *MF.getFunction()); + switch (RC->getID()) { + default: + return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); + case AMDGPU::VGPR_32RegClassID: + return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); + case AMDGPU::SGPR_32RegClassID: + return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); + } +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + if (Idx == getVGPRPressureSet()) + return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, + const_cast<MachineFunction &>(MF)); + + if (Idx == getSGPRPressureSet()) + return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, + const_cast<MachineFunction &>(MF)); + + return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); +} + +const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { + static const int Empty[] = { -1 }; + + if (hasRegUnit(AMDGPU::M0, RegUnit)) + return Empty; + return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 0bcae7d9840c..679ed229758a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -21,8 +21,8 @@ namespace llvm { -class SISubtarget; class MachineRegisterInfo; +class SISubtarget; class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { @@ -31,13 +31,22 @@ private: unsigned VGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; + bool SpillSGPRToVGPR; + bool SpillSGPRToSMEM; void reserveRegisterTuples(BitVector &, unsigned Reg) const; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; - public: - SIRegisterInfo(); + SIRegisterInfo(const SISubtarget &ST); + + bool spillSGPRToVGPR() const { + return SpillSGPRToVGPR; + } + + bool spillSGPRToSMEM() const { + return SpillSGPRToSMEM; + } /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. 
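The next hunks change spillSGPR() / restoreSGPR() to return a bool and add an OnlyToVGPR mode, used by the new eliminateSGPRToVGPRSpillFrameIndex() entry point. A minimal standalone sketch of that contract follows (not LLVM code; the frame-index table and its contents are invented for illustration, standing in for SIMachineFunctionInfo::getSGPRToVGPRSpills()):

#include <cstdio>
#include <map>
#include <vector>

// One reserved lane of a VGPR used to hold a spilled SGPR sub-register.
struct SpilledReg { int VGPR; int Lane; };

// Frame index -> VGPR lanes reserved for an SGPR spill (hypothetical data).
static std::map<int, std::vector<SpilledReg>> SGPRToVGPRSpills = {
    {0, {{0, 0}, {0, 1}}}, // frame index 0: lanes 0 and 1 of a reserved VGPR
};

// With OnlyToVGPR == true the spill may only use reserved VGPR lanes; if the
// slot has none, report failure so the caller can fall back to the generic
// eliminateFrameIndex() path (SMEM store or a real stack slot).
static bool spillSGPR(int FrameIndex, bool OnlyToVGPR) {
  bool SpillToVGPR = SGPRToVGPRSpills.count(FrameIndex) != 0;
  if (OnlyToVGPR && !SpillToVGPR)
    return false;
  // ... a real implementation writes each sub-register with V_WRITELANE_B32
  // here, or emits a memory spill when no lanes are reserved ...
  return true;
}

int main() {
  std::printf("spill FI#0, VGPR only: %d\n", spillSGPR(0, true)); // prints 1
  std::printf("spill FI#1, VGPR only: %d\n", spillSGPR(1, true)); // prints 0
}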
@@ -78,16 +87,23 @@ public:
   const TargetRegisterClass *getPointerRegClass(
     const MachineFunction &MF, unsigned Kind = 0) const override;
 
-  void spillSGPR(MachineBasicBlock::iterator MI,
-                 int FI, RegScavenger *RS) const;
+  /// If \p OnlyToVGPR is true, this will only succeed if the spill can be
+  /// done to previously assigned VGPR lanes, and returns false otherwise.
+  bool spillSGPR(MachineBasicBlock::iterator MI,
+                 int FI, RegScavenger *RS,
+                 bool OnlyToVGPR = false) const;
 
-  void restoreSGPR(MachineBasicBlock::iterator MI,
-                   int FI, RegScavenger *RS) const;
+  bool restoreSGPR(MachineBasicBlock::iterator MI,
+                   int FI, RegScavenger *RS,
+                   bool OnlyToVGPR = false) const;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;
 
+  bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
+                                          int FI, RegScavenger *RS) const;
+
   unsigned getHWRegIndex(unsigned Reg) const {
     return getEncodingValue(Reg) & 0xff;
   }
@@ -195,74 +210,23 @@ public:
     return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
   }
 
-  /// \returns SGPR allocation granularity supported by the subtarget.
-  unsigned getSGPRAllocGranule() const {
-    return 8;
-  }
-
-  /// \returns Total number of SGPRs supported by the subtarget.
-  unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
-
-  /// \returns Number of addressable SGPRs supported by the subtarget.
-  unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
-
-  /// \returns Number of reserved SGPRs supported by the subtarget.
-  unsigned getNumReservedSGPRs(const SISubtarget &ST,
-                               const SIMachineFunctionInfo &MFI) const;
-
-  /// \returns Minimum number of SGPRs that meets given number of waves per
-  /// execution unit requirement for given subtarget.
-  unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
-
-  /// \returns Maximum number of SGPRs that meets given number of waves per
-  /// execution unit requirement for given subtarget.
-  unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
-                          bool Addressable) const;
-
-  /// \returns Maximum number of SGPRs that meets number of waves per execution
-  /// unit requirement for function \p MF, or number of SGPRs explicitly
-  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
-
-  /// \returns VGPR allocation granularity supported by the subtarget.
-  unsigned getVGPRAllocGranule() const {
-    return 4;
-  }
-
-  /// \returns Total number of VGPRs supported by the subtarget.
-  unsigned getTotalNumVGPRs() const {
-    return 256;
-  }
-
-  /// \returns Number of reserved VGPRs for debugger use supported by the
-  /// subtarget.
-  unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+  ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+                                     unsigned EltSize) const;
 
-  /// \returns Minimum number of SGPRs that meets given number of waves per
-  /// execution unit requirement.
- unsigned getMinNumVGPRs(unsigned WavesPerEU) const; + bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const override; - /// \returns Maximum number of VGPRs that meets given number of waves per - /// execution unit requirement. - unsigned getMaxNumVGPRs(unsigned WavesPerEU) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; - /// \returns Maximum number of VGPRs that meets number of waves per execution - /// unit requirement for function \p MF, or number of VGPRs explicitly - /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. - /// - /// \returns Value that meets number of waves per execution unit requirement - /// if explicitly requested value cannot be converted to integer, violates - /// subtarget's specifications, or does not meet number of waves per execution - /// unit requirement. - unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; - ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, - unsigned EltSize) const; + const int *getRegUnitPressureSets(unsigned RegUnit) const override; private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index 31e714b9f6b9..fc808011cd88 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; +def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; +def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; +def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; + // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -128,7 +133,7 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 103))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. @@ -179,7 +184,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, (add (decimate (shl SGPR_32, 15), 4))]>; // Trap handler TMP 32-bit registers -def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, (add (sequence "TTMP%u", 0, 11))> { let isAllocatable = 0; } @@ -197,7 +202,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +// i16/f16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -258,19 +264,20 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, - TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { let AllocationPriority = 7; } -def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; } @@ -319,7 +326,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { let AllocationPriority = 11; } -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { +def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; let AllocationPriority = 12; @@ -366,7 +373,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } @@ -417,6 +424,18 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> { let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher<MatchName#"F64">; } + + def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_V2INT16"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">; + let DecoderMethod = "decodeOperand_VSrcV216"; + } + + def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> { + let OperandType = opType#"_V2FP16"; + let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">; + let DecoderMethod = "decodeOperand_VSrcV216"; + } } } diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index be27966fd5f1..0f02f5825cb0 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel { let MicroOpBufferSize = 1; let IssueWidth = 1; let PostRAScheduler = 1; + + // FIXME:Approximate 2 * branch cost. Try to hack around bad + // early-ifcvt heuristics. These need improvement to avoid the OOE + // heuristics. 
+ int MispredictPenalty = 20; } def SIFullSpeedModel : SISchedMachineModel; diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index dd31dc690840..c5f121757e62 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -497,24 +497,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); if (Op32DstIdx != -1) { // dst - Inst32.addOperand(MI.getOperand(0)); + Inst32.add(MI.getOperand(0)); } else { assert(MI.getOperand(0).getReg() == AMDGPU::VCC && "Unexpected case"); } - Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (Src1) - Inst32.addOperand(*Src1); + Inst32.add(*Src1); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { - Inst32.addOperand(*Src2); + Inst32.add(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is // replaced with an implicit read of vcc. This was already added diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 02656483cd74..5b840a14dbc3 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -226,9 +226,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime> def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast<LoadSDNode>(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || - (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; @@ -293,12 +293,6 @@ def : Pat < let Predicates = [isVI] in { -// 1. 
Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0) ->; - def : Pat < (i64 (readcyclecounter)), (S_MEMREALTIME) diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 73cd5774128e..b4adbdd1df07 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -82,6 +82,12 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo < let has_sdst = 0; } +class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo < + opName, (outs), (ins SReg_32:$src0), + "$src0", pattern> { + let has_sdst = 0; +} + class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0), "$sdst, $src0", pattern @@ -210,7 +216,7 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; } // End Uses = [M0] -def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">; +def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">; let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32">; @@ -428,7 +434,7 @@ def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">; def S_CBRANCH_G_FORK : SOP2_Pseudo < "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), + (ins SCSrc_b64:$src0, SCSrc_b64:$src1), "$src0, $src1" > { let has_sdst = 0; @@ -438,6 +444,22 @@ let Defs = [SCC] in { def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; } // End Defs = [SCC] +let SubtargetPredicate = isVI in { + def S_RFE_RESTORE_B64 : SOP2_Pseudo < + "s_rfe_restore_b64", (outs), + (ins SSrc_b64:$src0, SSrc_b32:$src1), + "$src0, $src1" + > { + let hasSideEffects = 1; + let has_sdst = 0; + } +} + +let SubtargetPredicate = isGFX9 in { + def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; + def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; + def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; +} //===----------------------------------------------------------------------===// // SOPK Instructions @@ -751,6 +773,14 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", let isReturn = 1; } +let SubtargetPredicate = isVI in { +def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { + let simm16 = 0; + let isBarrier = 1; + let isReturn = 1; +} +} + let isBranch = 1, SchedRW = [WriteBranch] in { def S_BRANCH : SOPP < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", @@ -792,6 +822,25 @@ def S_CBRANCH_EXECNZ : SOPP < >; } // End Uses = [EXEC] +def S_CBRANCH_CDBGSYS : SOPP < + 0x00000017, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys $simm16" +>; + +def S_CBRANCH_CDBGSYS_AND_USER : SOPP < + 0x0000001A, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_and_user $simm16" +>; + +def S_CBRANCH_CDBGSYS_OR_USER : SOPP < + 0x00000019, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_or_user $simm16" +>; + +def S_CBRANCH_CDBGUSER : SOPP < + 0x00000018, (ins sopp_brtarget:$simm16), + "s_cbranch_cdbguser $simm16" +>; } // End isBranch = 1 } // End isTerminator = 1 @@ -806,9 +855,18 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", let isConvergent = 1; } +let SubtargetPredicate = isVI in { +def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { + let simm16 = 0; + let mayLoad = 1; + let mayStore = 1; +} +} + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def 
S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. On VI the @@ -1207,6 +1265,10 @@ def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>; def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>; def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>; def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>; +def S_PACK_LL_B32_B16_vi : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>; +def S_PACK_LH_B32_B16_vi : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>; +def S_PACK_HH_B32_B16_vi : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>; +def S_RFE_RESTORE_B64_vi : SOP2_Real_vi <0x2b, S_RFE_RESTORE_B64>; def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>; def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5f651d4da5d2..86095a8e1142 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===// +//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===// // // The LLVM Compiler Infrastructure // @@ -6,32 +6,42 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -#include "AMDGPUBaseInfo.h" + #include "AMDGPU.h" +#include "AMDGPUBaseInfo.h" #include "SIDefines.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <utility> -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" -#undef GET_SUBTARGETINFO_ENUM +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" -#undef GET_REGINFO_ENUM #define GET_INSTRINFO_NAMED_OPS -#define GET_INSTRINFO_ENUM #include "AMDGPUGenInstrInfo.inc" #undef GET_INSTRINFO_NAMED_OPS -#undef GET_INSTRINFO_ENUM namespace { @@ -56,11 +66,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { return (Src & getBitMask(Shift, Width)) >> Shift; } -/// \returns Vmcnt bit shift. -unsigned getVmcntBitShift() { return 0; } +/// \returns Vmcnt bit shift (lower bits). +unsigned getVmcntBitShiftLo() { return 0; } -/// \returns Vmcnt bit width. -unsigned getVmcntBitWidth() { return 4; } +/// \returns Vmcnt bit width (lower bits). +unsigned getVmcntBitWidthLo() { return 4; } /// \returns Expcnt bit shift. unsigned getExpcntBitShift() { return 4; } @@ -74,52 +84,224 @@ unsigned getLgkmcntBitShift() { return 8; } /// \returns Lgkmcnt bit width. unsigned getLgkmcntBitWidth() { return 4; } -} // anonymous namespace +/// \returns Vmcnt bit shift (higher bits). 
+unsigned getVmcntBitShiftHi() { return 14; } + +/// \returns Vmcnt bit width (higher bits). +unsigned getVmcntBitWidthHi() { return 2; } + +} // end namespace anonymous namespace llvm { namespace AMDGPU { -IsaVersion getIsaVersion(const FeatureBitset &Features) { +namespace IsaInfo { +IsaVersion getIsaVersion(const FeatureBitset &Features) { + // CI. if (Features.test(FeatureISAVersion7_0_0)) return {7, 0, 0}; - if (Features.test(FeatureISAVersion7_0_1)) return {7, 0, 1}; - if (Features.test(FeatureISAVersion7_0_2)) return {7, 0, 2}; + // VI. if (Features.test(FeatureISAVersion8_0_0)) return {8, 0, 0}; - if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; - if (Features.test(FeatureISAVersion8_0_2)) return {8, 0, 2}; - if (Features.test(FeatureISAVersion8_0_3)) return {8, 0, 3}; - if (Features.test(FeatureISAVersion8_0_4)) return {8, 0, 4}; - if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; - return {0, 0, 0}; + // GFX9. + if (Features.test(FeatureISAVersion9_0_0)) + return {9, 0, 0}; + if (Features.test(FeatureISAVersion9_0_1)) + return {9, 0, 1}; + + if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) + return {0, 0, 0}; + return {7, 0, 0}; +} + +unsigned getWavefrontSize(const FeatureBitset &Features) { + if (Features.test(FeatureWavefrontSize16)) + return 16; + if (Features.test(FeatureWavefrontSize32)) + return 32; + + return 64; +} + +unsigned getLocalMemorySize(const FeatureBitset &Features) { + if (Features.test(FeatureLocalMemorySize32768)) + return 32768; + if (Features.test(FeatureLocalMemorySize65536)) + return 65536; + + return 0; +} + +unsigned getEUsPerCU(const FeatureBitset &Features) { + return 4; +} + +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + if (!Features.test(FeatureGCN)) + return 8; + unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize); + if (N == 1) + return 40; + N = 40 / N; + return std::min(N, 16u); +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features) { + return getMaxWavesPerEU(Features) * getEUsPerCU(Features); +} + +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return getWavesPerWorkGroup(Features, FlatWorkGroupSize); +} + +unsigned getMinWavesPerEU(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxWavesPerEU(const FeatureBitset &Features) { + if (!Features.test(FeatureGCN)) + return 8; + // FIXME: Need to take scratch memory into account. 
+ return 10; +} + +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize), + getEUsPerCU(Features)) / getEUsPerCU(Features); +} + +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) { + return 1; +} + +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) { + return 2048; +} + +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize) { + return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) / + getWavefrontSize(Features); +} + +unsigned getSGPRAllocGranule(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 16; + return 8; +} + +unsigned getSGPREncodingGranule(const FeatureBitset &Features) { + return 8; +} + +unsigned getTotalNumSGPRs(const FeatureBitset &Features) { + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 800; + return 512; +} + +unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { + if (Features.test(FeatureSGPRInitBug)) + return FIXED_NUM_SGPRS_FOR_INIT_BUG; + + IsaVersion Version = getIsaVersion(Features); + if (Version.Major >= 8) + return 102; + return 104; +} + +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + if (WavesPerEU >= getMaxWavesPerEU(Features)) + return 0; + unsigned MinNumSGPRs = + alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1), + getSGPRAllocGranule(Features)) + 1; + return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features)); +} + +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable) { + assert(WavesPerEU != 0); + + IsaVersion Version = getIsaVersion(Features); + unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU, + getSGPRAllocGranule(Features)); + unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features); + if (Version.Major >= 8 && !Addressable) + AddressableNumSGPRs = 112; + return std::min(MaxNumSGPRs, AddressableNumSGPRs); +} + +unsigned getVGPRAllocGranule(const FeatureBitset &Features) { + return 4; +} + +unsigned getVGPREncodingGranule(const FeatureBitset &Features) { + return getVGPRAllocGranule(Features); +} + +unsigned getTotalNumVGPRs(const FeatureBitset &Features) { + return 256; } +unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { + return getTotalNumVGPRs(Features); +} + +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + if (WavesPerEU >= getMaxWavesPerEU(Features)) + return 0; + unsigned MinNumVGPRs = + alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1), + getVGPRAllocGranule(Features)) + 1; + return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features)); +} + +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { + assert(WavesPerEU != 0); + + unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU, + getVGPRAllocGranule(Features)); + unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features); + return std::min(MaxNumVGPRs, AddressableNumVGPRs); +} + +} // end namespace IsaInfo + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features) { - - IsaVersion ISA = getIsaVersion(Features); + IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features); memset(&Header, 0, sizeof(Header)); Header.amd_kernel_code_version_major = 1; - Header.amd_kernel_code_version_minor = 0; + 
Header.amd_kernel_code_version_minor = 1; Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU Header.amd_machine_version_major = ISA.Major; Header.amd_machine_version_minor = ISA.Minor; @@ -127,6 +309,11 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.kernel_code_entry_byte_offset = sizeof(Header); // wavefront_size is specified as a power of 2: 2^6 = 64 threads. Header.wavefront_size = 6; + + // If the code object does not support indirect functions, then the value must + // be 0xffffffff. + Header.call_convention = -1; + // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. Header.kernarg_segment_alignment = 4; @@ -161,16 +348,16 @@ MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { ELF::SHF_AMDGPU_HSA_AGENT); } -bool isGroupSegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS; } -bool isGlobalSegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS; } -bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) { + return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -208,7 +395,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, return Default; } if (Strs.second.trim().getAsInteger(0, Ints.second)) { - if (!OnlyFirstRequired || Strs.second.trim().size()) { + if (!OnlyFirstRequired || !Strs.second.trim().empty()) { Ctx.emitError("can't parse second integer attribute " + Name); return Default; } @@ -217,57 +404,84 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, return Ints; } -unsigned getWaitcntBitMask(IsaVersion Version) { - unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth()); - unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); - return Vmcnt | Expcnt | Lgkmcnt; -} +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { + unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; + if (Version.Major < 9) + return VmcntLo; -unsigned getVmcntBitMask(IsaVersion Version) { - return (1 << getVmcntBitWidth()) - 1; + unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } -unsigned getExpcntBitMask(IsaVersion Version) { +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getExpcntBitWidth()) - 1; } -unsigned getLgkmcntBitMask(IsaVersion Version) { +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) { return (1 << getLgkmcntBitWidth()) - 1; } -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { + unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); + unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); + unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; + if (Version.Major < 9) + 
return Waitcnt; + + unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi()); + return Waitcnt | VmcntHi; +} + +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { + unsigned VmcntLo = + unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return VmcntLo; + + unsigned VmcntHi = + unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); + VmcntHi <<= getVmcntBitWidthLo(); + return VmcntLo | VmcntHi; } -unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) { +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) { Vmcnt = decodeVmcnt(Version, Waitcnt); Expcnt = decodeExpcnt(Version, Waitcnt); Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } -unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) { - return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth()); +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Vmcnt) { + Waitcnt = + packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); + if (Version.Major < 9) + return Waitcnt; + + Vmcnt >>= getVmcntBitWidthLo(); + return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); } -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) { +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt) { return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt) { return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); @@ -296,6 +510,10 @@ bool isCompute(CallingConv::ID cc) { return !isShader(cc) || cc == CallingConv::AMDGPU_CS; } +bool isEntryFunctionCC(CallingConv::ID CC) { + return true; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } @@ -327,13 +545,34 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { return Reg; } +unsigned mc2PseudoReg(unsigned Reg) { + switch (Reg) { + case AMDGPU::FLAT_SCR_ci: + case AMDGPU::FLAT_SCR_vi: + return FLAT_SCR; + + case AMDGPU::FLAT_SCR_LO_ci: + case AMDGPU::FLAT_SCR_LO_vi: + return AMDGPU::FLAT_SCR_LO; + + case AMDGPU::FLAT_SCR_HI_ci: + case AMDGPU::FLAT_SCR_HI_vi: + return AMDGPU::FLAT_SCR_HI; + + default: + return Reg; + } +} + bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; return OpType >= AMDGPU::OPERAND_SRC_FIRST && OpType <= AMDGPU::OPERAND_SRC_LAST; } bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < 
Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; switch (OpType) { case AMDGPU::OPERAND_REG_IMM_FP32: @@ -342,6 +581,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return true; default: return false; @@ -349,6 +589,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { } bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST && OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST; @@ -392,6 +633,7 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo) { + assert(OpNo < Desc.NumOperands); unsigned RCID = Desc.OpInfo[OpNo].RegClass; return getRegBitWidth(MRI->getRegClass(RCID)) / 8; } @@ -440,7 +682,8 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { } bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); + if (!HasInv2Pi) + return false; if (Literal >= -16 && Literal <= 64) return true; @@ -457,5 +700,92 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -} // End namespace AMDGPU -} // End namespace llvm +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + int16_t Lo16 = static_cast<int16_t>(Literal); + int16_t Hi16 = static_cast<int16_t>(Literal >> 16); + return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); +} + +bool isUniformMMO(const MachineMemOperand *MMO) { + const Value *Ptr = MMO->getValue(); + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + +int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { + if (isSI(ST) || isCI(ST)) + return ByteOffset >> 2; + + return ByteOffset; +} + +bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { + int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); + return isSI(ST) || isCI(ST) ? 
isUInt<8>(EncodedOffset) : + isUInt<20>(EncodedOffset); +} +} // end namespace AMDGPU + +} // end namespace llvm + +const unsigned AMDGPUAS::MAX_COMMON_ADDRESS; +const unsigned AMDGPUAS::GLOBAL_ADDRESS; +const unsigned AMDGPUAS::LOCAL_ADDRESS; +const unsigned AMDGPUAS::PARAM_D_ADDRESS; +const unsigned AMDGPUAS::PARAM_I_ADDRESS; +const unsigned AMDGPUAS::CONSTANT_BUFFER_0; +const unsigned AMDGPUAS::CONSTANT_BUFFER_1; +const unsigned AMDGPUAS::CONSTANT_BUFFER_2; +const unsigned AMDGPUAS::CONSTANT_BUFFER_3; +const unsigned AMDGPUAS::CONSTANT_BUFFER_4; +const unsigned AMDGPUAS::CONSTANT_BUFFER_5; +const unsigned AMDGPUAS::CONSTANT_BUFFER_6; +const unsigned AMDGPUAS::CONSTANT_BUFFER_7; +const unsigned AMDGPUAS::CONSTANT_BUFFER_8; +const unsigned AMDGPUAS::CONSTANT_BUFFER_9; +const unsigned AMDGPUAS::CONSTANT_BUFFER_10; +const unsigned AMDGPUAS::CONSTANT_BUFFER_11; +const unsigned AMDGPUAS::CONSTANT_BUFFER_12; +const unsigned AMDGPUAS::CONSTANT_BUFFER_13; +const unsigned AMDGPUAS::CONSTANT_BUFFER_14; +const unsigned AMDGPUAS::CONSTANT_BUFFER_15; +const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE; + +namespace llvm { +namespace AMDGPU { + +AMDGPUAS getAMDGPUAS(Triple T) { + auto Env = T.getEnvironmentName(); + AMDGPUAS AS; + if (Env == "amdgiz" || Env == "amdgizcl") { + AS.FLAT_ADDRESS = 0; + AS.PRIVATE_ADDRESS = 5; + AS.REGION_ADDRESS = 4; + } + else { + AS.FLAT_ADDRESS = 4; + AS.PRIVATE_ADDRESS = 0; + AS.REGION_ADDRESS = 5; + } + return AS; +} + +AMDGPUAS getAMDGPUAS(const TargetMachine &M) { + return getAMDGPUAS(M.getTargetTriple()); +} + +AMDGPUAS getAMDGPUAS(const Module &M) { + return getAMDGPUAS(Triple(M.getTargetTriple())); +} +} // namespace AMDGPU +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index ea5fc366d205..d6c836eb748b 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1,4 +1,4 @@ -//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===// +//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,39 +10,143 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H +#include "AMDGPU.h" #include "AMDKernelCodeT.h" -#include "llvm/IR/CallingConv.h" - #include "SIDefines.h" - -#define GET_INSTRINFO_OPERAND_ENUM -#include "AMDGPUGenInstrInfo.inc" -#undef GET_INSTRINFO_OPERAND_ENUM +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstdint> +#include <utility> namespace llvm { class FeatureBitset; class Function; class GlobalValue; +class MachineMemOperand; class MCContext; -class MCInstrDesc; class MCRegisterClass; class MCRegisterInfo; class MCSection; class MCSubtargetInfo; +class Triple; namespace AMDGPU { +namespace IsaInfo { -LLVM_READONLY -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +enum { + // The closed Vulkan driver sets 96, which limits the wave count to 8 but + // doesn't spill SGPRs as much as when 80 is set. + FIXED_NUM_SGPRS_FOR_INIT_BUG = 96 +}; +/// \brief Instruction set architecture version. struct IsaVersion { unsigned Major; unsigned Minor; unsigned Stepping; }; +/// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); + +/// \returns Wavefront size for given subtarget \p Features. 
+unsigned getWavefrontSize(const FeatureBitset &Features); + +/// \returns Local memory size in bytes for given subtarget \p Features. +unsigned getLocalMemorySize(const FeatureBitset &Features); + +/// \returns Number of execution units per compute unit for given subtarget \p +/// Features. +unsigned getEUsPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of work groups per compute unit for given subtarget +/// \p Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features without any kind of limitation. +unsigned getMaxWavesPerCU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per compute unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerCU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum number of waves per execution unit for given subtarget \p +/// Features. +unsigned getMinWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features without any kind of limitation. +unsigned getMaxWavesPerEU(const FeatureBitset &Features); + +/// \returns Maximum number of waves per execution unit for given subtarget \p +/// Features and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerEU(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns Minimum flat work group size for given subtarget \p Features. +unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Maximum flat work group size for given subtarget \p Features. +unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features); + +/// \returns Number of waves per work group for given subtarget \p Features and +/// limited by given \p FlatWorkGroupSize. +unsigned getWavesPerWorkGroup(const FeatureBitset &Features, + unsigned FlatWorkGroupSize); + +/// \returns SGPR allocation granularity for given subtarget \p Features. +unsigned getSGPRAllocGranule(const FeatureBitset &Features); + +/// \returns SGPR encoding granularity for given subtarget \p Features. +unsigned getSGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of SGPRs for given subtarget \p Features. +unsigned getTotalNumSGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of SGPRs for given subtarget \p Features. +unsigned getAddressableNumSGPRs(const FeatureBitset &Features); + +/// \returns Minimum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +/// \returns Maximum number of SGPRs that meets the given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, + bool Addressable); + +/// \returns VGPR allocation granularity for given subtarget \p Features. +unsigned getVGPRAllocGranule(const FeatureBitset &Features); + +/// \returns VGPR encoding granularity for given subtarget \p Features. +unsigned getVGPREncodingGranule(const FeatureBitset &Features); + +/// \returns Total number of VGPRs for given subtarget \p Features. 
+unsigned getTotalNumVGPRs(const FeatureBitset &Features); + +/// \returns Addressable number of VGPRs for given subtarget \p Features. +unsigned getAddressableNumVGPRs(const FeatureBitset &Features); + +/// \returns Minimum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +/// \returns Maximum number of VGPRs that meets given number of waves per +/// execution unit requirement for given subtarget \p Features. +unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); + +} // end namespace IsaInfo + +LLVM_READONLY +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); MCSection *getHSATextSection(MCContext &Ctx); @@ -53,9 +157,9 @@ MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); -bool isGroupSegment(const GlobalValue *GV); -bool isGlobalSegment(const GlobalValue *GV); -bool isReadOnlySegment(const GlobalValue *GV); +bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS); +bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS); +bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS); /// \returns True if constants should be emitted to .text section for given /// target triple \p TT, false otherwise. @@ -83,64 +187,89 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, std::pair<int, int> Default, bool OnlyFirstRequired = false); -/// \returns Waitcnt bit mask for given isa \p Version. -unsigned getWaitcntBitMask(IsaVersion Version); - /// \returns Vmcnt bit mask for given isa \p Version. -unsigned getVmcntBitMask(IsaVersion Version); +unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Expcnt bit mask for given isa \p Version. -unsigned getExpcntBitMask(IsaVersion Version); +unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Lgkmcnt bit mask for given isa \p Version. -unsigned getLgkmcntBitMask(IsaVersion Version); +unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version); + +/// \returns Waitcnt bit mask for given isa \p Version. +unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version); /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt); +unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and /// \p Lgkmcnt respectively. 
/// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) +/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] /// \p Lgkmcnt = \p Waitcnt[11:8] -void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. -unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt); +unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Vmcnt); /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version. -unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt); +unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Expcnt); /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version. -unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt); +unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, + unsigned Lgkmcnt); /// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa /// \p Version. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: -/// Waitcnt[3:0] = \p Vmcnt -/// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) +/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) +/// Waitcnt[6:4] = \p Expcnt +/// Waitcnt[11:8] = \p Lgkmcnt +/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. -unsigned encodeWaitcnt(IsaVersion Version, +unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); unsigned getInitialPSInputAddr(const Function &F); -bool isShader(CallingConv::ID cc); -bool isCompute(CallingConv::ID cc); +LLVM_READNONE +bool isShader(CallingConv::ID CC); + +LLVM_READNONE +bool isCompute(CallingConv::ID CC); + +LLVM_READNONE +bool isEntryFunctionCC(CallingConv::ID CC); + +// FIXME: Remove this when calling conventions cleaned up +LLVM_READNONE +inline bool isKernel(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + default: + return false; + } +} bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); @@ -150,6 +279,10 @@ bool isVI(const MCSubtargetInfo &STI); /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); +/// \brief Convert hardware register \p Reg to a pseudo register +LLVM_READNONE +unsigned mc2PseudoReg(unsigned Reg); + /// \brief Can this operand also contain immediate values? 
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); @@ -188,6 +321,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: return 2; default: @@ -210,7 +345,21 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); + +bool isUniformMMO(const MachineMemOperand *MMO); + +/// \returns The encoding that will be used for \p ByteOffset in the SMRD +/// offset field. +int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); + +/// \returns true if this offset is small enough to fit in the SMRD +/// offset field. \p ByteOffset should be the offset in bytes and +/// not the encoded offset. +bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); + } // end namespace AMDGPU } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index c55eaab077d1..991408c81c92 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -87,7 +87,7 @@ COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR), -// TODO: enable_trap_handler +COMPPGM2(enable_trap_handler, compute_pgm_rsrc2_trap_handler, TRAP_HANDLER), COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN), COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN), COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN), diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 8cae83cd9d1a..1febc6bf8ec2 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -23,18 +23,18 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 { class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> vdst; - + let Inst{8-0} = 0xf9; // sdwa let Inst{16-9} = op; let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; // encoding } -class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : +class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : InstSI <P.Outs32, P.Ins32, "", pattern>, VOP <opName>, - SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>, + MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -75,6 +75,8 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -83,10 +85,17 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : } class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { - list<dag> ret = 
!if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]); + list<dag> ret = + !if(P.HasModifiers, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + !if(P.HasOMod, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))] + ) + ); } multiclass VOP1Inst <string opName, VOPProfile P, @@ -96,6 +105,23 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _sdwa : VOP1_SDWA_Pseudo <opName, P>; } +// Special profile for instructions which have clamp +// and output modifiers (but have no input modifiers) +class VOPProfileI2F<ValueType dstVt, ValueType srcVt> : + VOPProfile<[dstVt, srcVt, untyped, untyped]> { + + let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod); + let Asm64 = "$vdst, $src0$clamp$omod"; + + let HasModifiers = 0; + let HasClamp = 1; + let HasOMod = 1; +} + +def VOP1_F64_I32 : VOPProfileI2F <f64, i32>; +def VOP1_F32_I32 : VOPProfileI2F <f32, i32>; +def VOP1_F16_I16 : VOPProfileI2F <f16, i16>; + //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -142,24 +168,24 @@ def V_READFIRSTLANE_B32 : let SchedRW = [WriteQuarterRate32] in { defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; -defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>; -defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>; -defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>; +defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; +defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; -defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>; -defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>; +defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; -defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>; -defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>; -defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>; -defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>; +defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>; +defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>; +defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>; +defm 
V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>; defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; -defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>; +defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; } // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; @@ -237,7 +263,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; - let Asm64 = getAsm64<1, 1, 0>.ret; + let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; @@ -258,11 +284,14 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; } // End Uses = [M0, EXEC] +let SchedRW = [WriteQuarterRate32] in { +defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; +} + // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { -defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>; @@ -297,8 +326,8 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; let SubtargetPredicate = isVI in { -defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; @@ -326,12 +355,31 @@ def : Pat< >; def : Pat< - (i16 (fp_to_f16 f32:$src)), + (i16 (AMDGPUfp_to_f16 f32:$src)), (V_CVT_F16_F32_e32 $src) >; } +def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1); + let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1); + let Outs64 = Outs32; + let Asm32 = " $vdst, $src0"; + let Asm64 = ""; + let Ins64 = (ins); +} + +let SubtargetPredicate = isGFX9 in { + let Constraints = "$vdst = $src1, $vdst1 = $src0", + DisableEncoding="$vdst1,$src1", + SchedRW = [Write64Bit, Write64Bit] in { +// Never VOP3. 
Takes as long as 2 v_mov_b32s +def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; +} + +} // End SubtargetPredicate = isGFX9 + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -453,6 +501,14 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } +multiclass VOP1Only_Real_vi <bits<10> op> { + let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { + def _vi : + VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; + } +} + multiclass VOP1_Real_vi <bits<10> op> { let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { def _e32_vi : @@ -480,6 +536,7 @@ defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>; defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>; defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>; defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>; +defm V_MOV_FED_B32 : VOP1_Real_vi <0x9>; defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>; defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>; defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>; @@ -547,7 +604,7 @@ defm V_RNDNE_F16 : VOP1_Real_vi <0x47>; defm V_FRACT_F16 : VOP1_Real_vi <0x48>; defm V_SIN_F16 : VOP1_Real_vi <0x49>; defm V_COS_F16 : VOP1_Real_vi <0x4a>; - +defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>; // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 00e5ab3db0b7..2281f338ab45 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -40,7 +40,7 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 { class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { bits<8> vdst; bits<8> src1; - + let Inst{8-0} = 0xf9; // sdwa let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); @@ -93,6 +93,8 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -119,8 +121,7 @@ multiclass VOP2Inst <string opName, def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; } // TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst @@ -134,10 +135,10 @@ multiclass VOP2bInst <string opName, let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _sdwa : VOP2_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; } @@ -154,6 +155,7 @@ multiclass VOP2eInst <string opName, def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; } + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; } @@ -179,10 +181,12 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { 
def VOP_MADMK_F16 : VOP_MADMK <f16>; def VOP_MADMK_F32 : VOP_MADMK <f32>; +// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory +// and processing time but it makes it easier to convert to mad. class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, - HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; + HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument @@ -194,6 +198,7 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; let HasSrc2 = 0; @@ -204,13 +209,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { def VOP_MAC_F16 : VOP_MAC <f16> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret; } def VOP_MAC_F32 : VOP_MAC <f32> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret; } // Write out to vcc or arbitrary SGPR. @@ -280,7 +285,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let Outs32 = (outs VGPR_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; @@ -354,7 +359,7 @@ defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>; defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; @@ -494,6 +499,14 @@ def : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) >; +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. 
+def : Pat< + (add i16:$src0, (i16 NegSubInlineConst16:$src1)), + (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) +>; + } // End Predicates = [isVI] //===----------------------------------------------------------------------===// @@ -566,7 +579,10 @@ defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>; defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; defm V_READLANE_B32 : VOP2_Real_si <0x01>; + +let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in { defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; +} defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>; defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>; @@ -646,7 +662,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" - + multiclass VOP2_SDWA_Real <bits<6> op> { def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index c2a4d4ba99b1..217a07488853 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -29,6 +29,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { ret1)); } +class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, + (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + class getVOP3Pat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; @@ -86,6 +106,14 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { let DstRC = RegisterOperand<VReg_64>; } +def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; + + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; +} + //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// @@ -209,10 +237,8 @@ def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I3 def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>; let isCommutable = 1 in { -def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>; - -// XXX - Does this set VCC? 
-def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>; +def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; +def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End isCommutable = 1 } // End SubtargetPredicate = isCIVI @@ -234,12 +260,14 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; } // End isCommutable = 1 +def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + } // End SubtargetPredicate = isVI let Predicates = [isVI] in { -multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, - Instruction inst, SDPatternOperator op3> { +multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, + Instruction inst, SDPatternOperator op3> { def : Pat< (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst i16:$src0, i16:$src1, i16:$src2) @@ -258,11 +286,26 @@ def : Pat< >; } -defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>; -defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>; } // End Predicates = [isVI] +let SubtargetPredicate = isGFX9 in { +def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>; +def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + +def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>; +def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>; +def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>; +} + //===----------------------------------------------------------------------===// // Target @@ -351,11 +394,19 @@ multiclass VOP3_Real_ci<bits<9> op> { } } +multiclass VOP3be_Real_ci<bits<9> op> { + def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>, + VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> { + let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; + } +} + defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; -defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>; -defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>; -defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>; +defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; +defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; +defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>; //===----------------------------------------------------------------------===// // VI @@ -376,8 +427,8 @@ multiclass VOP3be_Real_vi<bits<10> op> { } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; -defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>; -defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>; +defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; +defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>; defm V_MAD_F32 : VOP3_Real_vi <0x1c1>; @@ -424,6 +475,8 @@ defm V_MAD_F16 : VOP3_Real_vi <0x1ea>; defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; 
+defm V_PERM_B32 : VOP3_Real_vi <0x1ed>; + defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>; @@ -449,3 +502,16 @@ defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>; defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>; + +defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>; +defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>; +defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>; +defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>; +defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; +defm V_OR3_B32 : VOP3_Real_vi <0x202>; +defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; + +defm V_XAD_U32 : VOP3_Real_vi <0x1f3>; +defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; +defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; +defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td new file mode 100644 index 000000000000..96d343099132 --- /dev/null +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -0,0 +1,82 @@ +//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VOP3P Classes +//===----------------------------------------------------------------------===// + +class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : + VOP3P_Pseudo<OpName, P, + !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret) +>; + +// Non-packed instructions that use the VOP3P encoding. i.e. where +// omod/abs are used. +class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : + VOP3P_Pseudo<OpName, P, + !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret) +>; + +let isCommutable = 1 in { +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; +def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; +def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>; + +def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; +def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; +def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; + +def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; +def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>; +def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>; +def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; +} + +def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; +def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; +def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; + +// XXX - Commutable? 
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; +def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; +def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; + + +multiclass VOP3P_Real_vi<bits<10> op> { + def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> { + let AssemblerPredicates = [HasVOP3PInsts]; + let DecoderNamespace = "VI"; + } +} + +defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; +defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; +defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; +defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; +defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; + +defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; +defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; +defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; +defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>; +defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>; +defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>; +defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>; + +defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>; +defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; +defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index 16a456da3c67..a3550a63677b 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -93,6 +93,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -165,13 +167,11 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - def _sdwa : VOPC_SDWA_Pseudo <opName, P>, - Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> { + def _sdwa : VOPC_SDWA_Pseudo <opName, P> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; let isConvergent = DefExec; let isCompare = 1; - let isCommutable = 1; } } @@ -563,7 +563,7 @@ multiclass VOPC_CLASS_F16 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>; multiclass VOPCX_CLASS_F16 <string opName> : - VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>; multiclass VOPC_CLASS_F32 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 5f72f97d9e28..69906c419db3 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -68,8 +68,9 @@ class VOP3Common <dag outs, dag ins, string asm = "", let hasPostISelHook = 1; } -class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> : - InstSI <P.Outs64, P.Ins64, "", pattern>, +class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], + bit VOP3Only = 0, bit isVOP3P = 0> : + InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>, VOP <opName>, SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>, MnemonicAlias<opName#"_e64", opName> { @@ -79,7 +80,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On let UseNamedOperandTable = 1; string 
Mnemonic = opName; - string AsmOperands = P.Asm64; + string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64); let Size = 8; let mayLoad = 0; @@ -100,23 +101,34 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On let VOP3 = 1; let VALU = 1; + let FPClamp = P.HasFPClamp; let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.VOP3; let AsmMatchConverter = !if(!eq(VOP3Only,1), - "cvtVOP3", - !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", "")); + !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"), + !if(!eq(P.HasModifiers, 1), + "cvtVOP3_2_mod", + !if(!eq(P.HasOMod, 1), "cvtVOP3OMod", "") + ) + ); VOPProfile Pfl = P; } +class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : + VOP3_Pseudo<opName, P, pattern, 1, 1> { + let VOP3P = 1; +} + class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let isPseudo = 0; let isCodeGenOnly = 0; + let UseNamedOperandTable = 1; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; @@ -128,8 +140,15 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let Uses = ps.Uses; } +// XXX - Is there any reason to distingusih this from regular VOP3 +// here? +class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> : + VOP3_Real<ps, EncodingFamily>; + class VOP3a<VOPProfile P> : Enc64 { bits<2> src0_modifiers; bits<9> src0; @@ -197,6 +216,42 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } +class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { + bits<8> vdst; + // neg, neg_hi, op_sel put in srcN_modifiers + bits<4> src0_modifiers; + bits<9> src0; + bits<4> src1_modifiers; + bits<9> src1; + bits<4> src2_modifiers; + bits<9> src2; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + + let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) + + let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2) + + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = !if(P.HasSrc0, src0, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); + let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0) + let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) +} + class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> { let Inst{25-17} = op; } @@ -250,7 +305,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOP <opName>, SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>, MnemonicAlias <opName#"_sdwa", opName> { - + let isPseudo = 1; let isCodeGenOnly = 1; let UseNamedOperandTable = 1; @@ -261,14 +316,14 @@ class 
VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let Size = 8; let mayLoad = 0; let mayStore = 0; - let hasSideEffects = 0; + let hasSideEffects = 0; let VALU = 1; let SDWA = 1; let Uses = [EXEC]; - - let SubtargetPredicate = isVI; - let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + + let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst); + let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst); let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA, AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA"; @@ -337,8 +392,8 @@ class VOP_DPP <string OpName, VOPProfile P> : let Size = 8; let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); - let SubtargetPredicate = isVI; - let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst); + let SubtargetPredicate = HasDPP; + let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst); let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let DecoderNamespace = "DPP"; @@ -348,3 +403,4 @@ include "VOPCInstructions.td" include "VOP1Instructions.td" include "VOP2Instructions.td" include "VOP3Instructions.td" +include "VOP3PInstructions.td" |
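
The AMDGPUBaseInfo.h hunk above introduces the IsaInfo query interface: wavefront size, waves per CU/EU, flat work group limits, and SGPR/VGPR budgets, all keyed on a subtarget FeatureBitset. A rough usage sketch, assuming only the declarations shown above and that the patched Utils/AMDGPUBaseInfo.h is included; the helper name pickSGPRBudget and the include path are illustrative, not part of the patch:

#include "llvm/MC/SubtargetFeature.h" // llvm::FeatureBitset (path assumed)
// #include "Utils/AMDGPUBaseInfo.h"  // the header patched above

// Illustrative only: choose the largest addressable SGPR count that still
// satisfies a waves-per-EU occupancy target, using the IsaInfo queries
// declared in the hunk above.
static unsigned pickSGPRBudget(const llvm::FeatureBitset &Features,
                               unsigned WavesPerEU) {
  using namespace llvm::AMDGPU::IsaInfo;
  unsigned Min = getMinNumSGPRs(Features, WavesPerEU);
  unsigned Max = getMaxNumSGPRs(Features, WavesPerEU, /*Addressable=*/true);
  return Max < Min ? Min : Max; // defensive clamp; normally Max >= Min
}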
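
The decodeWaitcnt/encodeWaitcnt comments above spell out the waitcnt field layout, including the gfx9+ split of Vmcnt across bits [3:0] and [15:14]. A minimal stand-alone sketch of that packing, derived only from the documented layout; real code should call AMDGPU::encodeWaitcnt with the appropriate IsaInfo::IsaVersion:

#include <cstdint>

// Packs Vmcnt/Expcnt/Lgkmcnt per the gfx9+ layout documented above:
//   Waitcnt[3:0]   = Vmcnt[3:0]
//   Waitcnt[6:4]   = Expcnt
//   Waitcnt[11:8]  = Lgkmcnt
//   Waitcnt[15:14] = Vmcnt[5:4]
static uint32_t packWaitcntGfx9(uint32_t Vmcnt, uint32_t Expcnt,
                                uint32_t Lgkmcnt) {
  uint32_t W = 0;
  W |= Vmcnt & 0xf;                // Vmcnt low bits
  W |= (Expcnt & 0x7) << 4;        // Expcnt
  W |= (Lgkmcnt & 0xf) << 8;       // Lgkmcnt
  W |= ((Vmcnt >> 4) & 0x3) << 14; // Vmcnt high bits (gfx9+ only)
  return W;
}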
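
The VOP2 pattern above that turns add x, (NegSubInlineConst16 c) back into V_SUB_U16 exists because the positive constant is more likely to be an inline immediate than its negation. A rough predicate capturing that intent, assuming the usual AMDGPU integer inline-immediate range of -16..64; the names here are illustrative and not taken from the patch:

#include <cstdint>

// Assumed integer inline-immediate range; anything outside it costs a
// 32-bit literal dword in the encoding.
static bool isIntInlineImm(int32_t V) { return V >= -16 && V <= 64; }

// Prefer v_sub_u16 x, c over the canonical add x, -c when c is inlineable
// but -c is not.
static bool preferSubOverAdd(int16_t NegC) {
  int32_t C = -static_cast<int32_t>(NegC);
  return !isIntInlineImm(NegC) && isIntInlineImm(C);
}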
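
The new VOP3Pe class in VOPInstructions.td lays out the 64-bit VOP3P encoding bit by bit: neg_hi and op_sel bits drawn from the operand modifier fields, clamp, a 10-bit opcode, the fixed 0x34 encoding field, three 9-bit sources, and the op_sel_hi/neg bits in the top byte. The same layout expressed as a plain C++ packer, assuming already-encoded source and 4-bit modifier values and omitting the HasSrcN/HasClamp presence checks; this mirrors the TableGen description and is not the MC emitter:

#include <cstdint>

// srcN are 9-bit encoded operands, srcN_mods are the 4-bit modifier fields
// (bit 0 = neg, 1 = neg_hi, 2 = op_sel, 3 = op_sel_hi), op is the 10-bit opcode.
static uint64_t packVOP3P(uint32_t op, uint32_t vdst,
                          uint32_t src0, uint32_t src1, uint32_t src2,
                          uint32_t src0_mods, uint32_t src1_mods,
                          uint32_t src2_mods, bool clamp) {
  uint64_t Inst = 0;
  Inst |= (uint64_t)(vdst & 0xff);                    // Inst{7-0}   vdst
  Inst |= (uint64_t)((src0_mods >> 1) & 1) << 8;      // Inst{8}     neg_hi(0)
  Inst |= (uint64_t)((src1_mods >> 1) & 1) << 9;      // Inst{9}     neg_hi(1)
  Inst |= (uint64_t)((src2_mods >> 1) & 1) << 10;     // Inst{10}    neg_hi(2)
  Inst |= (uint64_t)((src0_mods >> 2) & 1) << 11;     // Inst{11}    op_sel(0)
  Inst |= (uint64_t)((src1_mods >> 2) & 1) << 12;     // Inst{12}    op_sel(1)
  Inst |= (uint64_t)((src2_mods >> 2) & 1) << 13;     // Inst{13}    op_sel(2)
  Inst |= (uint64_t)((src2_mods >> 3) & 1) << 14;     // Inst{14}    op_sel_hi(2)
  Inst |= (uint64_t)(clamp ? 1 : 0) << 15;            // Inst{15}    clamp
  Inst |= (uint64_t)(op & 0x3ff) << 16;               // Inst{25-16} opcode
  Inst |= (uint64_t)0x34 << 26;                       // Inst{31-26} encoding
  Inst |= (uint64_t)(src0 & 0x1ff) << 32;             // Inst{40-32} src0
  Inst |= (uint64_t)(src1 & 0x1ff) << 41;             // Inst{49-41} src1
  Inst |= (uint64_t)(src2 & 0x1ff) << 50;             // Inst{58-50} src2
  Inst |= (uint64_t)((src0_mods >> 3) & 1) << 59;     // Inst{59}    op_sel_hi(0)
  Inst |= (uint64_t)((src1_mods >> 3) & 1) << 60;     // Inst{60}    op_sel_hi(1)
  Inst |= (uint64_t)(src0_mods & 1) << 61;            // Inst{61}    neg(0)
  Inst |= (uint64_t)(src1_mods & 1) << 62;            // Inst{62}    neg(1)
  Inst |= (uint64_t)(src2_mods & 1) << 63;            // Inst{63}    neg(2)
  return Inst;
}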