diff options
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 203 |
1 files changed, 144 insertions, 59 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dc868f010d85c..6984f4e716130 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -39,7 +40,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -117,10 +117,23 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass( cl::init(true)); // Option to run late CFG structurizer -static cl::opt<bool> LateCFGStructurize( +static cl::opt<bool, true> LateCFGStructurize( "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), - cl::init(false), + cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), + cl::Hidden); + +static cl::opt<bool> EnableAMDGPUFunctionCalls( + "amdgpu-function-calls", + cl::Hidden, + cl::desc("Enable AMDGPU function call support"), + cl::init(false)); + +// Enable lib calls simplifications +static cl::opt<bool> EnableLibCallSimplify( + "amdgpu-simplify-libcall", + cl::desc("Enable mdgpu library simplifications"), + cl::init(true), cl::Hidden); extern "C" void LLVMInitializeAMDGPUTarget() { @@ -129,20 +142,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() { RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeR600ClauseMergePassPass(*PR); + initializeR600ControlFlowFinalizerPass(*PR); + initializeR600PacketizerPass(*PR); + 
initializeR600ExpandSpecialInstrsPassPass(*PR); + initializeR600VectorRegMergerPass(*PR); + initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); - initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitsPass(*PR); @@ -150,10 +172,15 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); + initializeSIMemoryLegalizerPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeSIFixWWMLivenessPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); + initializeAMDGPUUseNativeCallsPass(*PR); + initializeAMDGPUSimplifyLibCallsPass(*PR); + initializeAMDGPUInlinerPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -192,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { GCNIterativeScheduler::SCHEDULE_MINREGFORCED); } +static ScheduleDAGInstrs * +createIterativeILPMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_ILP); + 
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + return DAG; +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -215,9 +252,18 @@ GCNMinRegSchedRegistry("gcn-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler); +static MachineSchedRegistry +GCNILPSchedRegistry("gcn-ilp", + "Run GCN iterative scheduler for ILP scheduling (experimental)", + createIterativeILPMachineScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. + if (TT.getEnvironmentName() == "amdgiz" || + TT.getEnvironmentName() == "amdgizcl") + return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } @@ -239,9 +285,8 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { if (!GPU.empty()) return GPU; - // HSA only supports CI+, so change the default GPU to a CI for HSA. if (TT.getArch() == Triple::amdgcn) - return (TT.getOS() == Triple::AMDHSA) ? 
"kaveri" : "tahiti"; + return "generic"; return "r600"; } @@ -252,21 +297,30 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return Reloc::PIC_; } +static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { + if (CM) + return *CM; + return CodeModel::Small; +} + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, + Optional<CodeModel::Model> CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), - FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), - TLOF(createTLOF(getTargetTriple())) { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), + FS, Options, getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM), OptLevel), + TLOF(createTLOF(getTargetTriple())) { AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; +bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; + StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); return GPUAttr.hasAttribute(Attribute::None) ? @@ -288,15 +342,38 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() { }); } +/// Predicate for Internalize pass. 
+static bool mustPreserveGV(const GlobalValue &GV) { + if (const Function *F = dyn_cast<Function>(&GV)) + return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv()); + + return !GV.use_empty(); +} + void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { Builder.DivergentTarget = true; - bool Internalize = InternalizeSymbols && - (getOptLevel() > CodeGenOpt::None) && - (getTargetTriple().getArch() == Triple::amdgcn); - bool EarlyInline = EarlyInlineAll && - (getOptLevel() > CodeGenOpt::None); - bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None; + bool EnableOpt = getOptLevel() > CodeGenOpt::None; + bool Internalize = InternalizeSymbols; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; + bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; + bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; + + if (EnableAMDGPUFunctionCalls) { + delete Builder.Inliner; + Builder.Inliner = createAMDGPUFunctionInliningPass(); + } + + if (Internalize) { + // If we're generating code, we always have the whole program available. The + // relocations expected for externally visible functions aren't supported, + // so make sure every non-entry function is hidden. 
+ Builder.addExtension( + PassManagerBuilder::EP_EnabledOnOptLevel0, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + PM.add(createInternalizePass(mustPreserveGV)); + }); + } Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, @@ -308,38 +385,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { } PM.add(createAMDGPUUnifyMetadataPass()); if (Internalize) { - PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool { - if (const Function *F = dyn_cast<Function>(&GV)) { - if (F->isDeclaration()) - return true; - switch (F->getCallingConv()) { - default: - return false; - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - } - } - return !GV.use_empty(); - })); + PM.add(createInternalizePass(mustPreserveGV)); PM.add(createGlobalDCEPass()); } if (EarlyInline) PM.add(createAMDGPUAlwaysInlinePass(false)); }); + const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } + PM.add(llvm::createAMDGPUUseNativeCallsPass()); + if (LibCallSimplify) + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); }); Builder.addExtension( @@ -359,8 +423,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { 
setRequiresStructuredCFG(true); } @@ -392,8 +457,9 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { StringRef GPU = getGPUName(F); @@ -464,6 +530,7 @@ public: } bool addPreISel() override; + bool addInstSelector() override; void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -472,7 +539,12 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) {} + : AMDGPUPassConfig(TM, PM) { + // It is necessary to know the register usage of the entire call graph. We + // allow calls without EnableAMDGPUFunctionCalls if they are marked + // noinline, so this is always required. + setRequiresCodeGenSCCOrder(true); + } GCNTargetMachine &getGCNTargetMachine() const { return getTM<GCNTargetMachine>(); @@ -485,12 +557,10 @@ public: void addMachineSSAOptimization() override; bool addILPOpts() override; bool addInstSelector() override; -#ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; -#endif void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; @@ -540,15 +610,18 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPULowerIntrinsicsPass()); - // Function calls are not supported, so make sure we inline everything. 
- addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerLegacyPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a module with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. - addPass(createBarrierNoopPass()); + if (TM.getTargetTriple().getArch() == Triple::r600 || + !EnableAMDGPUFunctionCalls) { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerLegacyPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a module with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. + addPass(createBarrierNoopPass()); + } if (TM.getTargetTriple().getArch() == Triple::amdgcn) { // TODO: May want to move later or split into an early and late one. @@ -559,6 +632,9 @@ void AMDGPUPassConfig::addIRPasses() { // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + // Replace OpenCL enqueued block function pointers with global variables. 
+ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createInferAddressSpacesPass()); addPass(createAMDGPUPromoteAlloca()); @@ -609,7 +685,7 @@ bool AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel())); + addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); return false; } @@ -630,6 +706,11 @@ bool R600PassConfig::addPreISel() { return false; } +bool R600PassConfig::addInstSelector() { + addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel())); + return false; +} + void R600PassConfig::addPreRegAlloc() { addPass(createR600VectorRegMerger()); } @@ -725,7 +806,6 @@ bool GCNPassConfig::addInstSelector() { return false; } -#ifdef LLVM_BUILD_GLOBAL_ISEL bool GCNPassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; @@ -746,8 +826,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() { return false; } -#endif - void GCNPassConfig::addPreRegAlloc() { if (LateCFGStructurize) { addPass(createAMDGPUMachineCFGStructurizerPass()); @@ -764,19 +842,25 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + // This must be run after SILowerControlFlow, since it needs to use the + // machine-level CFG, but before register allocation. + insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + TargetPassConfig::addFastRegAlloc(RegAllocPass); } void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - // This needs to be run directly before register allocation because earlier - // passes might recompute live intervals. 
- insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + // This must be run after SILowerControlFlow, since it needs to use the + // machine-level CFG, but before register allocation. + insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } @@ -806,6 +890,7 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); + addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } |
