diff options
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 230 |
1 files changed, 209 insertions, 21 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d8a0c716279c..0202220b8011 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -15,24 +15,29 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "AMDGPURegisterBankInfo.h" +#endif #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" +#include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "R600MachineScheduler.h" #include "SIMachineScheduler.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" -#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" @@ -58,6 +63,11 @@ static cl::opt<bool> EnableSROA( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> +EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); + static cl::opt<bool> EnableR600IfConvert( "r600-if-convert", cl::desc("Use if conversion pass"), @@ -78,6 +88,36 @@ static cl::opt<bool> ScalarizeGlobal( cl::init(false), cl::Hidden); +// Option to run internalize pass. +static cl::opt<bool> InternalizeSymbols( + "amdgpu-internalize-symbols", + cl::desc("Enable elimination of non-kernel functions and unused globals"), + cl::init(false), + cl::Hidden); + +// Option to inline all early. +static cl::opt<bool> EarlyInlineAll( + "amdgpu-early-inline-all", + cl::desc("Inline all functions early"), + cl::init(false), + cl::Hidden); + +static cl::opt<bool> EnableSDWAPeephole( + "amdgpu-sdwa-peephole", + cl::desc("Enable SDWA peepholer"), + cl::init(true)); + +// Enable address space based alias analysis +static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); + +// Option to enable new waitcnt insertion pass. +static cl::opt<bool> EnableSIInsertWaitcntsPass( + "enable-si-insert-waitcnts", + cl::desc("Use new waitcnt insertion pass"), + cl::init(false)); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -86,22 +126,28 @@ extern "C" void LLVMInitializeAMDGPUTarget() { PassRegistry *PR = PassRegistry::getPassRegistry(); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); + initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); + initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitsPass(*PR); + initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeAMDGPUUnifyDivergentExitNodesPass(*PR); + initializeAMDGPUAAWrapperPassPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -119,13 +165,26 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = - new ScheduleDAGMILive(C, - llvm::make_unique<GCNMaxOccupancySchedStrategy>(C)); + new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } +static ScheduleDAGInstrs * +createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; +} + +static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { + return new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_MINREGFORCED); +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -139,6 +198,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler); +static MachineSchedRegistry +IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", + "Run GCN scheduler to maximize occupancy (experimental)", + createIterativeGCNMaxOccupancyMachineScheduler); + +static MachineSchedRegistry +GCNMinRegSchedRegistry("gcn-minreg", + "Run GCN iterative scheduler for minimal register usage (experimental)", + createMinRegScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. @@ -148,9 +217,14 @@ static StringRef computeDataLayout(const Triple &TT) { // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + if (TT.getEnvironmentName() == "amdgiz" || + TT.getEnvironmentName() == "amdgizcl") + return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } LLVM_READNONE @@ -180,6 +254,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), TLOF(createTLOF(getTargetTriple())) { + AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } @@ -199,8 +274,65 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { FSAttr.getValueAsString(); } -void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) { - PM.add(createAMDGPUUnifyMetadataPass()); +static ImmutablePass *createAMDGPUExternalAAWrapperPass() { + return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + }); +} + +void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { + Builder.DivergentTarget = true; + + bool Internalize = InternalizeSymbols && + (getOptLevel() > CodeGenOpt::None) && + (getTargetTriple().getArch() == Triple::amdgcn); + bool EarlyInline = EarlyInlineAll && + (getOptLevel() > CodeGenOpt::None); + bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None; + + Builder.addExtension( + PassManagerBuilder::EP_ModuleOptimizerEarly, + [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + PM.add(createAMDGPUUnifyMetadataPass()); + if (Internalize) { + PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool { + if (const Function *F = dyn_cast<Function>(&GV)) { + if (F->isDeclaration()) + return true; + switch (F->getCallingConv()) { + default: + return false; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + return !GV.use_empty(); + })); + PM.add(createGlobalDCEPass()); + } + if (EarlyInline) + PM.add(createAMDGPUAlwaysInlinePass(false)); + }); + + Builder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + if (AMDGPUAA) { + PM.add(createAMDGPUAAWrapperPass()); + PM.add(createAMDGPUExternalAAWrapperPass()); + } + }); } //===----------------------------------------------------------------------===// @@ -245,9 +377,21 @@ namespace { struct SIGISelActualAccessor : public GISelAccessor { std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; const AMDGPUCallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } }; } // end anonymous namespace @@ -281,6 +425,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); GISel->CallLoweringInfo.reset( new AMDGPUCallLowering(*I->getTargetLowering())); + GISel->Legalizer.reset(new AMDGPULegalizerInfo()); + + GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo())); + GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I, + *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get()))); #endif I->setGISelAccessor(*GISel); @@ -356,9 +505,9 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; - void addIRPasses() override; bool addPreISel() override; void addMachineSSAOptimization() override; + bool addILPOpts() override; bool addInstSelector() override; #ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; @@ -406,11 +555,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { } void AMDGPUPassConfig::addIRPasses() { + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAMDGPULowerIntrinsicsPass(&TM)); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); @@ -421,17 +574,33 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + // TODO: May want to move later or split into an early and late one. + + addPass(createAMDGPUCodeGenPreparePass( + static_cast<const GCNTargetMachine *>(&TM))); + } + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); - const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); if (TM.getOptLevel() > CodeGenOpt::None) { + addPass(createInferAddressSpacesPass()); addPass(createAMDGPUPromoteAlloca(&TM)); if (EnableSROA) addPass(createSROAPass()); addStraightLineScalarOptimizationPasses(); + + if (EnableAMDGPUAliasAnalysis) { + addPass(createAMDGPUAAWrapperPass()); + addPass(createExternalAAWrapperPass([](Pass &P, Function &, + AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + })); + } } TargetPassConfig::addIRPasses(); @@ -526,7 +695,12 @@ bool GCNPassConfig::addPreISel() { // FIXME: We need to run a pass to propagate the attributes when calls are // supported. - addPass(&AMDGPUAnnotateKernelFeaturesID); + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM)); + + // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit + // regions formed by them. + addPass(&AMDGPUUnifyDivergentExitNodesID); addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); @@ -549,13 +723,19 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); + addPass(createSIShrinkInstructionsPass()); + if (EnableSDWAPeephole) { + addPass(&SIPeepholeSDWAID); + addPass(&DeadMachineInstructionElimID); + } } -void GCNPassConfig::addIRPasses() { - // TODO: May want to move later or split into an early and late one. - addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); +bool GCNPassConfig::addILPOpts() { + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); - AMDGPUPassConfig::addIRPasses(); + TargetPassConfig::addILPOpts(); + return false; } bool GCNPassConfig::addInstSelector() { @@ -572,20 +752,23 @@ bool GCNPassConfig::addIRTranslator() { } bool GCNPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); return false; } bool GCNPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); return false; } bool GCNPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); return false; } + #endif void GCNPassConfig::addPreRegAlloc() { - addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } @@ -615,6 +798,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { + addPass(&SIFixVGPRCopiesID); addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } @@ -633,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); - addPass(createSIInsertWaitsPass()); + if (EnableSIInsertWaitcntsPass) + addPass(createSIInsertWaitcntsPass()); + else + addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); addPass(createSIDebuggerInsertNopsPass()); @@ -643,3 +830,4 @@ void GCNPassConfig::addPreEmitPass() { TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(this, PM); } + |