summary refs log tree commit diff
path: root/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 102
1 file changed, 58 insertions(+), 44 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2042dbf6d5e2..2205819c444f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief The AMDGPU target machine contains all of the hardware specific
+/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -40,6 +39,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
cl::init(true),
cl::Hidden);
-// Option to to control global loads scalarization
+// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
@@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
-// Option to enable new waitcnt insertion pass.
-static cl::opt<bool> EnableSIInsertWaitcntsPass(
- "enable-si-insert-waitcnts",
- cl::desc("Use new waitcnt insertion pass"),
- cl::init(true));
-
// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
"amdgpu-late-structurize",
@@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize(
cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
cl::Hidden);
-static cl::opt<bool> EnableAMDGPUFunctionCalls(
+static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
"amdgpu-function-calls",
- cl::Hidden,
cl::desc("Enable AMDGPU function call support"),
- cl::init(false));
+ cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
+ cl::init(false),
+ cl::Hidden);
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
- cl::desc("Enable mdgpu library simplifications"),
+ cl::desc("Enable amdgpu library simplifications"),
+ cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnableLowerKernelArguments(
+ "amdgpu-ir-lower-kernel-arguments",
+ cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
@@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeR600PacketizerPass(*PR);
initializeR600ExpandSpecialInstrsPassPass(*PR);
initializeR600VectorRegMergerPass(*PR);
+ initializeGlobalISel(*PR);
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
@@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPULowerKernelArgumentsPass(*PR);
+ initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
@@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
@@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIFixWWMLivenessPass(*PR);
+ initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
@@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- if (TT.getEnvironmentName() == "amdgiz" ||
- TT.getEnvironmentName() == "amdgizcl")
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
- "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
LLVM_READNONE
@@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
-
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
+bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
@@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing anything,
+ // and before other cleanup optimizations.
+ PM.add(createAMDGPULowerKernelAttributesPass());
});
}
@@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+TargetTransformInfo
+R600TargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(R600TTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
@@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+ I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+TargetTransformInfo
+GCNTargetMachine::getTargetTransformInfo(const Function &F) {
+ return TargetTransformInfo(GCNTTIImpl(this, F));
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
@@ -571,11 +581,6 @@ public:
} // end anonymous namespace
-TargetTransformInfo
-AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) {
- return TargetTransformInfo(AMDGPUTTIImpl(this, F));
-}
-
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOpt::Aggressive)
addPass(createGVNPass());
@@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
addPass(createSpeculativeExecutionPass());
// ReassociateGEPs exposes more opportunites for SLSR. See
@@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() {
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
- addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ if (TM.getTargetTriple().getArch() == Triple::r600)
+ addPass(createR600OpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
@@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
+ EnableLowerKernelArguments)
+ addPass(createAMDGPULowerKernelArgumentsPass());
+
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)
@@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
MachineSchedContext *C) const {
- const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
@@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {
addPass(&SIPeepholeSDWAID);
- addPass(&MachineLICMID);
+ addPass(&EarlyMachineLICMID);
addPass(&MachineCSEID);
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
@@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
+
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
@@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIMemoryLegalizerPass());
+ addPass(createSIInsertWaitcntsPass());
+ addPass(createSIShrinkInstructionsPass());
+
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
+ //
+ // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
+ // be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (EnableSIInsertWaitcntsPass)
- addPass(createSIInsertWaitcntsPass());
- else
- addPass(createSIInsertWaitsPass());
- addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
- addPass(createSIMemoryLegalizerPass());
addPass(createSIDebuggerInsertNopsPass());
addPass(&BranchRelaxationPassID);
}