summaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  203
1 file changed, 144 insertions, 59 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dc868f010d85c..6984f4e716130 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -39,7 +40,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -117,10 +117,23 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass(
cl::init(true));
// Option to run late CFG structurizer
-static cl::opt<bool> LateCFGStructurize(
+static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
cl::desc("Enable late CFG structurization"),
- cl::init(false),
+ cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
+ cl::Hidden);
+
+static cl::opt<bool> EnableAMDGPUFunctionCalls(
+ "amdgpu-function-calls",
+ cl::Hidden,
+ cl::desc("Enable AMDGPU function call support"),
+ cl::init(false));
+
+// Enable lib calls simplifications
+static cl::opt<bool> EnableLibCallSimplify(
+ "amdgpu-simplify-libcall",
+  cl::desc("Enable amdgpu library simplifications"),
+ cl::init(true),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
@@ -129,20 +142,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeR600ClauseMergePassPass(*PR);
+ initializeR600ControlFlowFinalizerPass(*PR);
+ initializeR600PacketizerPass(*PR);
+ initializeR600ExpandSpecialInstrsPassPass(*PR);
+ initializeR600VectorRegMergerPass(*PR);
+ initializeAMDGPUDAGToDAGISelPass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
- initializeSIFixControlFlowLiveIntervalsPass(*PR);
+ initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
+ initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
@@ -150,10 +172,15 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
+ initializeSIMemoryLegalizerPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
+ initializeAMDGPUUseNativeCallsPass(*PR);
+ initializeAMDGPUSimplifyLibCallsPass(*PR);
+ initializeAMDGPUInlinerPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -192,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}
+static ScheduleDAGInstrs *
+createIterativeILPMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_ILP);
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ return DAG;
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -215,9 +252,18 @@ GCNMinRegSchedRegistry("gcn-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
+static MachineSchedRegistry
+GCNILPSchedRegistry("gcn-ilp",
+ "Run GCN iterative scheduler for ILP scheduling (experimental)",
+ createIterativeILPMachineScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
@@ -239,9 +285,8 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
if (!GPU.empty())
return GPU;
- // HSA only supports CI+, so change the default GPU to a CI for HSA.
if (TT.getArch() == Triple::amdgcn)
- return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+ return "generic";
return "r600";
}
@@ -252,21 +297,30 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return Reloc::PIC_;
}
+static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
+ if (CM)
+ return *CM;
+ return CodeModel::Small;
+}
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM,
+ Optional<CodeModel::Model> CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
- FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
- TLOF(createTLOF(getTargetTriple())) {
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM),
+ getEffectiveCodeModel(CM), OptLevel),
+ TLOF(createTLOF(getTargetTriple())) {
AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?
@@ -288,15 +342,38 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
});
}
+/// Predicate for Internalize pass.
+static bool mustPreserveGV(const GlobalValue &GV) {
+ if (const Function *F = dyn_cast<Function>(&GV))
+ return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+ return !GV.use_empty();
+}
+
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.DivergentTarget = true;
- bool Internalize = InternalizeSymbols &&
- (getOptLevel() > CodeGenOpt::None) &&
- (getTargetTriple().getArch() == Triple::amdgcn);
- bool EarlyInline = EarlyInlineAll &&
- (getOptLevel() > CodeGenOpt::None);
- bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+ bool EnableOpt = getOptLevel() > CodeGenOpt::None;
+ bool Internalize = InternalizeSymbols;
+ bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
+ bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
+
+ if (EnableAMDGPUFunctionCalls) {
+ delete Builder.Inliner;
+ Builder.Inliner = createAMDGPUFunctionInliningPass();
+ }
+
+ if (Internalize) {
+ // If we're generating code, we always have the whole program available. The
+ // relocations expected for externally visible functions aren't supported,
+ // so make sure every non-entry function is hidden.
+ Builder.addExtension(
+ PassManagerBuilder::EP_EnabledOnOptLevel0,
+ [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createInternalizePass(mustPreserveGV));
+ });
+ }
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
@@ -308,38 +385,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
}
PM.add(createAMDGPUUnifyMetadataPass());
if (Internalize) {
- PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
- if (const Function *F = dyn_cast<Function>(&GV)) {
- if (F->isDeclaration())
- return true;
- switch (F->getCallingConv()) {
- default:
- return false;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return true;
- }
- }
- return !GV.use_empty();
- }));
+ PM.add(createInternalizePass(mustPreserveGV));
PM.add(createGlobalDCEPass());
}
if (EarlyInline)
PM.add(createAMDGPUAlwaysInlinePass(false));
});
+ const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
+ PM.add(llvm::createAMDGPUUseNativeCallsPass());
+ if (LibCallSimplify)
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
});
Builder.addExtension(
@@ -359,8 +423,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
}
@@ -392,8 +457,9 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
@@ -464,6 +530,7 @@ public:
}
bool addPreISel() override;
+ bool addInstSelector() override;
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
@@ -472,7 +539,12 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
- : AMDGPUPassConfig(TM, PM) {}
+ : AMDGPUPassConfig(TM, PM) {
+ // It is necessary to know the register usage of the entire call graph. We
+ // allow calls without EnableAMDGPUFunctionCalls if they are marked
+ // noinline, so this is always required.
+ setRequiresCodeGenSCCOrder(true);
+ }
GCNTargetMachine &getGCNTargetMachine() const {
return getTM<GCNTargetMachine>();
@@ -485,12 +557,10 @@ public:
void addMachineSSAOptimization() override;
bool addILPOpts() override;
bool addInstSelector() override;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
-#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
@@ -540,15 +610,18 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerIntrinsicsPass());
- // Function calls are not supported, so make sure we inline everything.
- addPass(createAMDGPUAlwaysInlinePass());
- addPass(createAlwaysInlinerLegacyPass());
- // We need to add the barrier noop pass, otherwise adding the function
- // inlining pass will cause all of the PassConfigs passes to be run
- // one function at a time, which means if we have a nodule with two
- // functions, then we will generate code for the first function
- // without ever running any passes on the second.
- addPass(createBarrierNoopPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600 ||
+ !EnableAMDGPUFunctionCalls) {
+ // Function calls are not supported, so make sure we inline everything.
+ addPass(createAMDGPUAlwaysInlinePass());
+ addPass(createAlwaysInlinerLegacyPass());
+ // We need to add the barrier noop pass, otherwise adding the function
+ // inlining pass will cause all of the PassConfigs passes to be run
+    // one function at a time, which means if we have a module with two
+ // functions, then we will generate code for the first function
+ // without ever running any passes on the second.
+ addPass(createBarrierNoopPass());
+ }
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
@@ -559,6 +632,9 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ // Replace OpenCL enqueued block function pointers with global variables.
+ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
addPass(createAMDGPUPromoteAlloca());
@@ -609,7 +685,7 @@ bool AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
+ addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
return false;
}
@@ -630,6 +706,11 @@ bool R600PassConfig::addPreISel() {
return false;
}
+bool R600PassConfig::addInstSelector() {
+ addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
+ return false;
+}
+
void R600PassConfig::addPreRegAlloc() {
addPass(createR600VectorRegMerger());
}
@@ -725,7 +806,6 @@ bool GCNPassConfig::addInstSelector() {
return false;
}
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
addPass(new IRTranslator());
return false;
@@ -746,8 +826,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
return false;
}
-#endif
-
void GCNPassConfig::addPreRegAlloc() {
if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
@@ -764,19 +842,25 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- // This needs to be run directly before register allocation because earlier
- // passes might recompute live intervals.
- insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+ insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
@@ -806,6 +890,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
+ addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}