Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  113
1 file changed, 87 insertions, 26 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb30d659bf0b5..b4b10835837cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUExportClustering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
@@ -23,6 +24,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
@@ -30,6 +32,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
@@ -138,6 +141,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
cl::init(true),
cl::Hidden);
+static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
+ "amdgpu-fixed-function-abi",
+ cl::desc("Enable all implicit function arguments"),
+ cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
+ cl::init(false),
+ cl::Hidden);
+
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
@@ -183,6 +193,11 @@ static cl::opt<bool> EnableScalarIRPasses(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableStructurizerWorkarounds(
+ "amdgpu-enable-structurizer-workarounds",
+ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -217,23 +232,29 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+ initializeAMDGPUPostLegalizerCombinerPass(*PR);
+ initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIRemoveShortExecBranchesPass(*PR);
+ initializeSIPreEmitPeepholePass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
+ initializeSIPostRABundlerPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -243,6 +264,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
+ initializeSIAddIMGInitPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -264,6 +286,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}
@@ -363,10 +386,17 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
+ if (TT.getArch() == Triple::amdgcn) {
+ if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
+ else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
+ }
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -416,20 +446,19 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
}
PM.add(createAMDGPUUnifyMetadataPass());
PM.add(createAMDGPUPrintfRuntimeBinding());
- PM.add(createAMDGPUPropagateAttributesLatePass(this));
- if (Internalize) {
+ if (Internalize)
PM.add(createInternalizePass(mustPreserveGV));
+ PM.add(createAMDGPUPropagateAttributesLatePass(this));
+ if (Internalize)
PM.add(createGlobalDCEPass());
- }
if (EarlyInline)
PM.add(createAMDGPUAlwaysInlinePass(false));
});
- const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
@@ -437,12 +466,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
- PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
});
Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,
- [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
@@ -450,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// This should run after inlining to have any chance of doing anything,
// and before other cleanup optimizations.
PM.add(createAMDGPULowerKernelAttributesPass());
+
+ // Promote alloca to vector before SROA and loop unroll. If we manage
+ // to eliminate allocas before unroll we may choose to unroll less.
+ if (EnableOpt)
+ PM.add(createAMDGPUPromoteAllocaToVector());
});
}
@@ -617,7 +651,9 @@ public:
bool addILPOpts() override;
bool addInstSelector() override;
bool addIRTranslator() override;
+ void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
@@ -751,10 +787,15 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (EnableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
+
+ // LowerSwitch pass may introduce unreachable blocks that can
+ // cause unexpected behavior for subsequent passes. Placing it
+ // here seems better, as these blocks will get cleaned up by
+ // UnreachableBlockElim, which is inserted next in the pass flow.
+ addPass(createLowerSwitchPass());
}
bool AMDGPUPassConfig::addPreISel() {
- addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}
@@ -836,7 +877,11 @@ bool GCNPassConfig::addPreISel() {
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
if (!LateCFGStructurize) {
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (EnableStructurizerWorkarounds) {
+ addPass(createFixIrreduciblePass());
+ addPass(createUnifyLoopExitsPass());
+ }
+ addPass(createStructurizeCFGPass(false)); // SkipUniformRegions = false
}
addPass(createSinkingPass());
addPass(createAMDGPUAnnotateUniformValues());
@@ -885,6 +930,12 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
+ // TODO: We have to add FinalizeISel
+ // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel,
+ // which expects the V_ADD/SUB -> V_ADDC/SUBB pairs to be expanded.
+ // This will be removed as soon as SIFixupVectorISel is changed
+ // to work with V_ADD/SUB_U64_PSEUDO instead.
+ addPass(&FinalizeISelID);
addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
@@ -895,11 +946,22 @@ bool GCNPassConfig::addIRTranslator() {
return false;
}
+void GCNPassConfig::addPreLegalizeMachineIR() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
+ addPass(new Localizer());
+}
+
bool GCNPassConfig::addLegalizeMachineIR() {
addPass(new Legalizer());
return false;
}
+void GCNPassConfig::addPreRegBankSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
@@ -933,12 +995,9 @@ void GCNPassConfig::addFastRegAlloc() {
}
void GCNPassConfig::addOptimizedRegAlloc() {
- if (OptExecMaskPreRA) {
+ if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
- insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
- } else {
- insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
- }
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
@@ -973,6 +1032,7 @@ void GCNPassConfig::addPostRegAlloc() {
}
void GCNPassConfig::addPreSched2() {
+ addPass(&SIPostRABundlerID);
}
void GCNPassConfig::addPreEmitPass() {
@@ -993,9 +1053,12 @@ void GCNPassConfig::addPreEmitPass() {
// FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
// be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
addPass(&SIRemoveShortExecBranchesID);
addPass(&SIInsertSkipsPassID);
+ addPass(&SIPreEmitPeepholeID);
addPass(&BranchRelaxationPassID);
}
@@ -1024,11 +1087,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->initializeBaseYamlFields(YamlMFI);
- auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
- if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
+ auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
+ Register TempReg;
+ if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
SourceRange = RegName.SourceRange;
return true;
}
+ RegVal = TempReg;
return false;
};
@@ -1046,7 +1111,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
};
if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
- parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
return true;
@@ -1056,11 +1120,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
- if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
- !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
- return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
- }
-
if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
!AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
@@ -1080,7 +1139,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return false;
if (A->IsRegister) {
- unsigned Reg;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
SourceRange = A->RegisterName.SourceRange;
return true;
@@ -1154,8 +1213,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
- MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
- MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;
+ MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
+ MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
+ MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
+ MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
return false;
}