Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
 -rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 230
 1 file changed, 209 insertions(+), 21 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d8a0c716279c..0202220b8011 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,24 +15,29 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AMDGPURegisterBankInfo.h"
+#endif
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
@@ -58,6 +63,11 @@ static cl::opt<bool> EnableSROA(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool>
+EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(false));
+
static cl::opt<bool> EnableR600IfConvert(
"r600-if-convert",
cl::desc("Use if conversion pass"),
@@ -78,6 +88,36 @@ static cl::opt<bool> ScalarizeGlobal(
cl::init(false),
cl::Hidden);
+// Option to run internalize pass.
+static cl::opt<bool> InternalizeSymbols(
+ "amdgpu-internalize-symbols",
+ cl::desc("Enable elimination of non-kernel functions and unused globals"),
+ cl::init(false),
+ cl::Hidden);
+
+// Option to inline all early.
+static cl::opt<bool> EarlyInlineAll(
+ "amdgpu-early-inline-all",
+ cl::desc("Inline all functions early"),
+ cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool> EnableSDWAPeephole(
+ "amdgpu-sdwa-peephole",
+ cl::desc("Enable SDWA peepholer"),
+ cl::init(true));
+
+// Enable address space based alias analysis
+static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+ cl::desc("Enable AMDGPU Alias Analysis"),
+ cl::init(true));
+
+// Option to enable new waitcnt insertion pass.
+static cl::opt<bool> EnableSIInsertWaitcntsPass(
+ "enable-si-insert-waitcnts",
+ cl::desc("Use new waitcnt insertion pass"),
+ cl::init(false));
+
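
All of the options above are plain cl::opt switches, so any tool that links the AMDGPU backend can toggle them. As a rough sketch (foo.ll is a placeholder input, not part of this patch), the codegen-side knobs could be exercised with something like:

  llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -amdgpu-early-ifcvt -enable-si-insert-waitcnts foo.ll

Note that -amdgpu-internalize-symbols and -amdgpu-early-inline-all act in the middle-end extension points installed by adjustPassManager() further down, not in llc's codegen pipeline.
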
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -86,22 +126,28 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
PassRegistry *PR = PassRegistry::getPassRegistry();
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFixVGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
+ initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
+ initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+ initializeAMDGPUAAWrapperPassPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -119,13 +165,26 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new ScheduleDAGMILive(C,
- llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+ new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+ return new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -139,6 +198,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+ "Run GCN scheduler to maximize occupancy (experimental)",
+ createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+ "Run GCN iterative scheduler for minimal register usage (experimental)",
+ createMinRegScheduler);
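
These two registries make the new GCNIterativeScheduler selectable by name through the generic machine-scheduler registry; assuming the usual -misched selector, an invocation would look roughly like:

  llc -march=amdgcn -misched=gcn-minreg foo.ll

(foo.ll again a placeholder). Both entries are explicitly labeled experimental; the non-iterative max-occupancy scheduler above remains the one registered as "gcn-max-occupancy".
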
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
@@ -148,9 +217,14 @@ static StringRef computeDataLayout(const Triple &TT) {
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
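
The new branch keys the layout off the amdgiz environment (reportedly "generic is zero"): there the default address space is the 64-bit flat/generic space (p:64:64), private memory moves to address space 5 (p5:32:32), and the trailing -A5 tells the IR layer that allocas live in address space 5, so an alloca comes out as, e.g., %tmp = alloca i32, addrspace(5). Every other environment keeps the old layout with a 32-bit default (private) address space.
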
LLVM_READNONE
@@ -180,6 +254,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
TLOF(createTLOF(getTargetTriple())) {
+ AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
@@ -199,8 +274,65 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
FSAttr.getValueAsString();
}
-void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
- PM.add(createAMDGPUUnifyMetadataPass());
+static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
+ return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ });
+}
+
+void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.DivergentTarget = true;
+
+ bool Internalize = InternalizeSymbols &&
+ (getOptLevel() > CodeGenOpt::None) &&
+ (getTargetTriple().getArch() == Triple::amdgcn);
+ bool EarlyInline = EarlyInlineAll &&
+ (getOptLevel() > CodeGenOpt::None);
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_ModuleOptimizerEarly,
+ [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ PM.add(createAMDGPUUnifyMetadataPass());
+ if (Internalize) {
+ PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ if (F->isDeclaration())
+ return true;
+ switch (F->getCallingConv()) {
+ default:
+ return false;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ }
+ }
+ return !GV.use_empty();
+ }));
+ PM.add(createGlobalDCEPass());
+ }
+ if (EarlyInline)
+ PM.add(createAMDGPUAlwaysInlinePass(false));
+ });
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ });
}
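
A note on the internalize predicate: createInternalizePass() treats a true return as "must preserve", so declarations and the AMDGPU entry-point calling conventions (AMDGPU_KERNEL and SPIR_KERNEL kernels plus the VS/GS/PS/CS graphics entry points) keep their linkage, every other defined function is internalized, and non-function globals are preserved only while they still have uses; the follow-up GlobalDCE then deletes whatever became dead. These hooks hang off PassManagerBuilder, so they fire when a front end (or opt) builds the optimizer pipeline; assuming opt's usual wiring of adjustPassManager through -mtriple, something like

  opt -O3 -mtriple=amdgcn--amdhsa -amdgpu-internalize-symbols foo.ll

would be one way to observe the effect (foo.ll a placeholder).
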
//===----------------------------------------------------------------------===//
@@ -245,9 +377,21 @@ namespace {
struct SIGISelActualAccessor : public GISelAccessor {
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
const AMDGPUCallLowering *getCallLowering() const override {
return CallLoweringInfo.get();
}
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
};
} // end anonymous namespace
@@ -281,6 +425,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
GISel->CallLoweringInfo.reset(
new AMDGPUCallLowering(*I->getTargetLowering()));
+ GISel->Legalizer.reset(new AMDGPULegalizerInfo());
+
+ GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo()));
+ GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I,
+ *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get())));
#endif
I->setGISelAccessor(*GISel);
@@ -356,9 +505,9 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
- void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
+ bool addILPOpts() override;
bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
@@ -406,11 +555,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
}
void AMDGPUPassConfig::addIRPasses() {
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAMDGPULowerIntrinsicsPass(&TM));
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
@@ -421,17 +574,33 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+
+ addPass(createAMDGPUCodeGenPreparePass(
+ static_cast<const GCNTargetMachine *>(&TM)));
+ }
+
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca(&TM));
if (EnableSROA)
addPass(createSROAPass());
addStraightLineScalarOptimizationPasses();
+
+ if (EnableAMDGPUAliasAnalysis) {
+ addPass(createAMDGPUAAWrapperPass());
+ addPass(createExternalAAWrapperPass([](Pass &P, Function &,
+ AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ }));
+ }
}
TargetPassConfig::addIRPasses();
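
The lambda above re-registers the same AMDGPU alias-analysis hookup as createAMDGPUExternalAAWrapperPass() earlier in the patch; the duplication is needed because this IR pipeline is built by the pass config rather than by PassManagerBuilder, so the extension points installed in adjustPassManager() never fire for it.
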
@@ -526,7 +695,12 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- addPass(&AMDGPUAnnotateKernelFeaturesID);
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+
+ // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+ // regions formed by them.
+ addPass(&AMDGPUUnifyDivergentExitNodesID);
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
@@ -549,13 +723,19 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
+ addPass(createSIShrinkInstructionsPass());
+ if (EnableSDWAPeephole) {
+ addPass(&SIPeepholeSDWAID);
+ addPass(&DeadMachineInstructionElimID);
+ }
}
-void GCNPassConfig::addIRPasses() {
- // TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+bool GCNPassConfig::addILPOpts() {
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
- AMDGPUPassConfig::addIRPasses();
+ TargetPassConfig::addILPOpts();
+ return false;
}
bool GCNPassConfig::addInstSelector() {
@@ -572,20 +752,23 @@ bool GCNPassConfig::addIRTranslator() {
}
bool GCNPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
return false;
}
bool GCNPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
return false;
}
bool GCNPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
return false;
}
+
#endif
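
With these hooks filled in, the GlobalISel path now builds the full four-stage pipeline: IRTranslator (already added above), then Legalizer, RegBankSelect, and InstructionSelect, backed by the AMDGPULegalizerInfo, AMDGPURegisterBankInfo, and AMDGPUInstructionSelector instances wired into the subtarget's GISelAccessor earlier in this patch. Before this change each of these hooks returned without scheduling a pass.
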
void GCNPassConfig::addPreRegAlloc() {
- addPass(createSIShrinkInstructionsPass());
addPass(createSIWholeQuadModePass());
}
@@ -615,6 +798,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
}
void GCNPassConfig::addPostRegAlloc() {
+ addPass(&SIFixVGPRCopiesID);
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
}
@@ -633,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
- addPass(createSIInsertWaitsPass());
+ if (EnableSIInsertWaitcntsPass)
+ addPass(createSIInsertWaitcntsPass());
+ else
+ addPass(createSIInsertWaitsPass());
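
The rewritten waitcnt insertion pass is opt-in: SIInsertWaits stays the default, and SIInsertWaitcnts only runs when -enable-si-insert-waitcnts (declared above with init(false)) is set.
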
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
@@ -643,3 +830,4 @@ void GCNPassConfig::addPreEmitPass() {
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(this, PM);
}
+