summary refs log tree commit diff
path: root/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 252
1 files changed, 201 insertions, 51 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b4b10835837c..ce7c82e2a88a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,45 +15,40 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
-#include "AMDGPUCallLowering.h"
#include "AMDGPUExportClustering.h"
-#include "AMDGPUInstructionSelector.h"
-#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"
-#include <memory>
using namespace llvm;
@@ -216,7 +211,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
- initializeSIFixupVectorISelPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
@@ -237,6 +231,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -260,7 +255,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUExternalAAWrapperPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
- initializeAMDGPUInlinerPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
@@ -284,7 +278,6 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -295,7 +288,6 @@ createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -309,7 +301,6 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@@ -345,15 +336,15 @@ GCNILPSchedRegistry("gcn-ilp",
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat, non-integral buffer fat pointers.
- return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+ return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
"-ni:7";
}
@@ -402,16 +393,14 @@ AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
- return GPUAttr.hasAttribute(Attribute::None) ?
- getTargetCPU() : GPUAttr.getValueAsString();
+ return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}
StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
Attribute FSAttr = F.getFnAttribute("target-features");
- return FSAttr.hasAttribute(Attribute::None) ?
- getTargetFeatureString() :
- FSAttr.getValueAsString();
+ return FSAttr.isValid() ? FSAttr.getValueAsString()
+ : getTargetFeatureString();
}
/// Predicate for Internalize pass.
@@ -433,7 +422,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
if (EnableFunctionCalls) {
delete Builder.Inliner;
- Builder.Inliner = createAMDGPUFunctionInliningPass();
+ Builder.Inliner = createFunctionInliningPass();
}
Builder.addExtension(
@@ -487,6 +476,133 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
});
}
+void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
+ AAM.registerFunctionAnalysis<AMDGPUAA>();
+}
+
+void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
+ bool DebugPassManager) {
+ PB.registerPipelineParsingCallback(
+ [this](StringRef PassName, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "amdgpu-propagate-attributes-late") {
+ PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-unify-metadata") {
+ PM.addPass(AMDGPUUnifyMetadataPass());
+ return true;
+ }
+ if (PassName == "amdgpu-printf-runtime-binding") {
+ PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+ return true;
+ }
+ if (PassName == "amdgpu-always-inline") {
+ PM.addPass(AMDGPUAlwaysInlinePass());
+ return true;
+ }
+ return false;
+ });
+ PB.registerPipelineParsingCallback(
+ [this](StringRef PassName, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "amdgpu-simplifylib") {
+ PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-usenative") {
+ PM.addPass(AMDGPUUseNativeCallsPass());
+ return true;
+ }
+ if (PassName == "amdgpu-promote-alloca") {
+ PM.addPass(AMDGPUPromoteAllocaPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-promote-alloca-to-vector") {
+ PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-lower-kernel-attributes") {
+ PM.addPass(AMDGPULowerKernelAttributesPass());
+ return true;
+ }
+ if (PassName == "amdgpu-propagate-attributes-early") {
+ PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ return true;
+ }
+
+ return false;
+ });
+
+ PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
+ FAM.registerPass([&] { return AMDGPUAA(); });
+ });
+
+ PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
+ if (AAName == "amdgpu-aa") {
+ AAM.registerFunctionAnalysis<AMDGPUAA>();
+ return true;
+ }
+ return false;
+ });
+
+ PB.registerPipelineStartEPCallback([this, DebugPassManager](
+ ModulePassManager &PM,
+ PassBuilder::OptimizationLevel Level) {
+ FunctionPassManager FPM(DebugPassManager);
+ FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
+ FPM.addPass(AMDGPUUseNativeCallsPass());
+ if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0)
+ FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ });
+
+ PB.registerPipelineEarlySimplificationEPCallback(
+ [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
+ if (Level == PassBuilder::OptimizationLevel::O0)
+ return;
+
+ PM.addPass(AMDGPUUnifyMetadataPass());
+ PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+
+ if (InternalizeSymbols) {
+ PM.addPass(InternalizePass(mustPreserveGV));
+ }
+ PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
+ if (InternalizeSymbols) {
+ PM.addPass(GlobalDCEPass());
+ }
+ if (EarlyInlineAll && !EnableFunctionCalls)
+ PM.addPass(AMDGPUAlwaysInlinePass());
+ });
+
+ PB.registerCGSCCOptimizerLateEPCallback(
+ [this, DebugPassManager](CGSCCPassManager &PM,
+ PassBuilder::OptimizationLevel Level) {
+ if (Level == PassBuilder::OptimizationLevel::O0)
+ return;
+
+ FunctionPassManager FPM(DebugPassManager);
+
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ FPM.addPass(InferAddressSpacesPass());
+
+ // This should run after inlining to have any chance of doing
+ // anything, and before other cleanup optimizations.
+ FPM.addPass(AMDGPULowerKernelAttributesPass());
+
+ if (Level != PassBuilder::OptimizationLevel::O0) {
+ // Promote alloca to vector before SROA and loop unroll. If we
+ // manage to eliminate allocas before unroll we may choose to unroll
+ // less.
+ FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+ }
+
+ PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+ });
+}
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
@@ -526,6 +642,39 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
return I.get();
}
+int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
+ return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)
+ ? -1
+ : 0;
+}
+
+bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
+ AMDGPU::isFlatGlobalAddrSpace(DestAS);
+}
+
+unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ const auto *LD = dyn_cast<LoadInst>(V);
+ if (!LD)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+ // It must be a generic pointer loaded.
+ assert(V->getType()->isPointerTy() &&
+ V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
+
+ const auto *Ptr = LD->getPointerOperand();
+ if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+ // For a generic pointer loaded from the constant memory, it could be assumed
+ // as a global pointer since the constant memory is only populated on the
+ // host side. As implied by the offload programming model, only global
+ // pointers could be referenced on the host side.
+ return AMDGPUAS::GLOBAL_ADDRESS;
+}
+
TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(R600TTIImpl(this, F));
@@ -593,7 +742,6 @@ public:
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -866,6 +1014,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+ addPass(createAMDGPULateCodeGenPreparePass());
if (EnableAtomicOptimizations) {
addPass(createAMDGPUAtomicOptimizerPass());
}
@@ -930,19 +1079,12 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
- // TODO: We have to add FinalizeISel
- // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel
- // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded.
- // Will be removed as soon as SIFixupVectorISel is changed
- // to work with V_ADD/SUB_U64_PSEUDO instead.
- addPass(&FinalizeISelID);
- addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
}
bool GCNPassConfig::addIRTranslator() {
- addPass(new IRTranslator());
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -969,6 +1111,10 @@ bool GCNPassConfig::addRegBankSelect() {
bool GCNPassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
+ // TODO: Fix instruction selection to do the right thing for image
+ // instructions with tfe or lwe in the first place, instead of running a
+ // separate pass to fix them up?
+ addPass(createSIAddIMGInitPass());
return false;
}
@@ -976,7 +1122,6 @@ void GCNPassConfig::addPreRegAlloc() {
if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
}
- addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc() {
@@ -988,13 +1133,18 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run just after RegisterCoalescing.
- insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
+ insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
+ insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
TargetPassConfig::addFastRegAlloc();
}
void GCNPassConfig::addOptimizedRegAlloc() {
+ // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
+ // instructions that cause scheduling barriers.
+ insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+ insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
+
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
@@ -1004,9 +1154,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
- // This must be run just after RegisterCoalescing.
- insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
-
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
@@ -1041,6 +1188,12 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIShrinkInstructionsPass());
addPass(createSIModeRegisterPass());
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
+
+ addPass(&SIRemoveShortExecBranchesID);
+ addPass(&SIInsertSkipsPassID);
+ addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able handle all hazards correctly. This is because if there
// are multiple scheduling regions in a basic block, the regions are scheduled
@@ -1049,16 +1202,7 @@ void GCNPassConfig::addPreEmitPass() {
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
- //
- // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
- // be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- if (getOptLevel() > CodeGenOpt::None)
- addPass(&SIInsertHardClausesID);
-
- addPass(&SIRemoveShortExecBranchesID);
- addPass(&SIInsertSkipsPassID);
- addPass(&SIPreEmitPeepholeID);
addPass(&BranchRelaxationPassID);
}
@@ -1087,6 +1231,12 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->initializeBaseYamlFields(YamlMFI);
+ if (MFI->Occupancy == 0) {
+ // Fixup the subtarget dependent default value.
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
+ }
+
auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
Register TempReg;
if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {