diff options
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 102 |
1 files changed, 58 insertions, 44 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2042dbf6d5e2..2205819c444f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The AMDGPU target machine contains all of the hardware specific +/// The AMDGPU target machine contains all of the hardware specific /// information needed to emit code for R600 and SI GPUs. // //===----------------------------------------------------------------------===// @@ -31,7 +31,6 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -40,6 +39,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer( cl::init(true), cl::Hidden); -// Option to to control global loads scalarization +// Option to control global loads scalarization static cl::opt<bool> ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), @@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true)); -// Option to enable new waitcnt insertion pass. -static cl::opt<bool> EnableSIInsertWaitcntsPass( - "enable-si-insert-waitcnts", - cl::desc("Use new waitcnt insertion pass"), - cl::init(true)); - // Option to run late CFG structurizer static cl::opt<bool, true> LateCFGStructurize( "amdgpu-late-structurize", @@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool> EnableAMDGPUFunctionCalls( +static cl::opt<bool, true> EnableAMDGPUFunctionCalls( "amdgpu-function-calls", - cl::Hidden, cl::desc("Enable AMDGPU function call support"), - cl::init(false)); + cl::location(AMDGPUTargetMachine::EnableFunctionCalls), + cl::init(false), + cl::Hidden); // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", - cl::desc("Enable mdgpu library simplifications"), + cl::desc("Enable amdgpu library simplifications"), + cl::init(true), + cl::Hidden); + +static cl::opt<bool> EnableLowerKernelArguments( + "amdgpu-ir-lower-kernel-arguments", + cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden); @@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600PacketizerPass(*PR); initializeR600ExpandSpecialInstrsPassPass(*PR); initializeR600VectorRegMergerPass(*PR); + initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); @@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); + initializeAMDGPULowerKernelArgumentsPass(*PR); + initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); @@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); - initializeSIInsertWaitsPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); @@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIFixWWMLivenessPass(*PR); + initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); @@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp", static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") - return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" + return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" - "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } LLVM_READNONE @@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; - bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; +bool AMDGPUTargetMachine::EnableFunctionCalls = false; + +AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); @@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); + + // This should run after inlining to have any chance of doing anything, + // and before other cleanup optimizations. + PM.add(createAMDGPULowerKernelAttributesPass()); }); } @@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( return I.get(); } +TargetTransformInfo +R600TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(R600TTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} -const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { +const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { StringRef GPU = getGPUName(F); StringRef FS = getFeatureString(F); @@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); + I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +TargetTransformInfo +GCNTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(GCNTTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // AMDGPU Pass Setup //===----------------------------------------------------------------------===// @@ -571,11 +581,6 @@ public: } // end anonymous namespace -TargetTransformInfo -AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { - return TargetTransformInfo(AMDGPUTTIImpl(this, F)); -} - void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createGVNPass()); @@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + addPass(createLICMPass()); addPass(createSeparateConstOffsetFromGEPPass()); addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See @@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() { } // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. - addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + if (TM.getTargetTriple().getArch() == Triple::r600) + addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); @@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn && + EnableLowerKernelArguments) + addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); if (EnableLoadStoreVectorizer) @@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( MachineSchedContext *C) const { - const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); if (ST.enableSIScheduler()) return createSIMachineScheduler(C); return createGCNMaxOccupancyMachineScheduler(C); @@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { addPass(&SIPeepholeSDWAID); - addPass(&MachineLICMID); + addPass(&EarlyMachineLICMID); addPass(&MachineCSEID); addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); @@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); + // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. @@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + addPass(createSIMemoryLegalizerPass()); + addPass(createSIInsertWaitcntsPass()); + addPass(createSIShrinkInstructionsPass()); + // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled @@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() { // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. + // + // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would + // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); - if (EnableSIInsertWaitcntsPass) - addPass(createSIInsertWaitcntsPass()); - else - addPass(createSIInsertWaitsPass()); - addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); - addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } |
