diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 113 |
1 file changed, 87 insertions, 26 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index eb30d659bf0b5..b4b10835837cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUExportClustering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUMacroFusion.h" @@ -23,6 +24,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600MachineScheduler.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" @@ -30,6 +32,7 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/Passes.h" @@ -138,6 +141,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt( cl::init(true), cl::Hidden); +static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( + "amdgpu-fixed-function-abi", + cl::desc("Enable all implicit function arguments"), + cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), + cl::init(false), + cl::Hidden); + // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -183,6 +193,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableStructurizerWorkarounds( + "amdgpu-enable-structurizer-workarounds", + cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> 
X(getTheAMDGPUTarget()); @@ -217,23 +232,29 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUPostLegalizerCombinerPass(*PR); + initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); + initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); + initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIRemoveShortExecBranchesPass(*PR); + initializeSIPreEmitPeepholePass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); + initializeSIPostRABundlerPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -243,6 +264,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); + initializeSIAddIMGInitPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -264,6 +286,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + 
DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; } @@ -363,10 +386,17 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); + if (TT.getArch() == Triple::amdgcn) { + if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); + else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); + } } bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; +bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -416,20 +446,19 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { } PM.add(createAMDGPUUnifyMetadataPass()); PM.add(createAMDGPUPrintfRuntimeBinding()); - PM.add(createAMDGPUPropagateAttributesLatePass(this)); - if (Internalize) { + if (Internalize) PM.add(createInternalizePass(mustPreserveGV)); + PM.add(createAMDGPUPropagateAttributesLatePass(this)); + if (Internalize) PM.add(createGlobalDCEPass()); - } if (EarlyInline) PM.add(createAMDGPUAlwaysInlinePass(false)); }); - const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); @@ -437,12 +466,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - 
PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); }); Builder.addExtension( PassManagerBuilder::EP_CGSCCOptimizerLate, - [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) { // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); @@ -450,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { // This should run after inlining to have any chance of doing anything, // and before other cleanup optimizations. PM.add(createAMDGPULowerKernelAttributesPass()); + + // Promote alloca to vector before SROA and loop unroll. If we manage + // to eliminate allocas before unroll we may choose to unroll less. + if (EnableOpt) + PM.add(createAMDGPUPromoteAllocaToVector()); }); } @@ -617,7 +651,9 @@ public: bool addILPOpts() override; bool addInstSelector() override; bool addIRTranslator() override; + void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; @@ -751,10 +787,15 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (EnableLoadStoreVectorizer) addPass(createLoadStoreVectorizerPass()); + + // LowerSwitch pass may introduce unreachable blocks that can + // cause unexpected behavior for subsequent passes. Placing it + // here seems better that these blocks would get cleaned up by + // UnreachableBlockElim inserted next in the pass flow. + addPass(createLowerSwitchPass()); } bool AMDGPUPassConfig::addPreISel() { - addPass(createLowerSwitchPass()); addPass(createFlattenCFGPass()); return false; } @@ -836,7 +877,11 @@ bool GCNPassConfig::addPreISel() { // regions formed by them. 
addPass(&AMDGPUUnifyDivergentExitNodesID); if (!LateCFGStructurize) { - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + if (EnableStructurizerWorkarounds) { + addPass(createFixIrreduciblePass()); + addPass(createUnifyLoopExitsPass()); + } + addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } addPass(createSinkingPass()); addPass(createAMDGPUAnnotateUniformValues()); @@ -885,6 +930,12 @@ bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(&SIFixSGPRCopiesID); addPass(createSILowerI1CopiesPass()); + // TODO: We have to add FinalizeISel + // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel + // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded. + // Will be removed as soon as SIFixupVectorISel is changed + // to work with V_ADD/SUB_U64_PSEUDO instead. + addPass(&FinalizeISelID); addPass(createSIFixupVectorISelPass()); addPass(createSIAddIMGInitPass()); return false; @@ -895,11 +946,22 @@ bool GCNPassConfig::addIRTranslator() { return false; } +void GCNPassConfig::addPreLegalizeMachineIR() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); + addPass(new Localizer()); +} + bool GCNPassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } +void GCNPassConfig::addPreRegBankSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); +} + bool GCNPassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; @@ -933,12 +995,9 @@ void GCNPassConfig::addFastRegAlloc() { } void GCNPassConfig::addOptimizedRegAlloc() { - if (OptExecMaskPreRA) { + if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); - insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); - } else { - insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); - } + insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); // This 
must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of @@ -973,6 +1032,7 @@ void GCNPassConfig::addPostRegAlloc() { } void GCNPassConfig::addPreSched2() { + addPass(&SIPostRABundlerID); } void GCNPassConfig::addPreEmitPass() { @@ -993,9 +1053,12 @@ void GCNPassConfig::addPreEmitPass() { // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIInsertHardClausesID); addPass(&SIRemoveShortExecBranchesID); addPass(&SIInsertSkipsPassID); + addPass(&SIPreEmitPeepholeID); addPass(&BranchRelaxationPassID); } @@ -1024,11 +1087,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->initializeBaseYamlFields(YamlMFI); - auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) { - if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) { + auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { + Register TempReg; + if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { SourceRange = RegName.SourceRange; return true; } + RegVal = TempReg; return false; }; @@ -1046,7 +1111,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( }; if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || - parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) || parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) return true; @@ -1056,11 +1120,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } - if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG && - !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) { - return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg); - } - if (MFI->FrameOffsetReg != 
AMDGPU::FP_REG && !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); @@ -1080,7 +1139,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return false; if (A->IsRegister) { - unsigned Reg; + Register Reg; if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { SourceRange = A->RegisterName.SourceRange; return true; @@ -1154,8 +1213,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->Mode.IEEE = YamlMFI.Mode.IEEE; MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; - MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals; - MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals; + MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; + MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; + MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; + MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; return false; } |