diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 212 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 818 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 233 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 16 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/R600ISelLowering.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 8 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 56 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2 |
11 files changed, 873 insertions, 488 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index b4a8766d682e..56a9a30bc59a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -29,6 +29,8 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" @@ -43,6 +45,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <tuple> #include <vector> #define DEBUG_TYPE "amdgpu-lower-module-lds" @@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { + // The verifier rejects used lists containing an inttoptr of a constant + // so remove the variables from these lists before replaceAllUsesWith + SmallPtrSet<Constant *, 32> LocalVarsSet; for (GlobalVariable *LocalVar : LocalVars) if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) @@ -146,12 +152,59 @@ public: } bool runOnModule(Module &M) override { + LLVMContext &Ctx = M.getContext(); CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); + + // Move variables used by functions into amdgcn.module.lds std::vector<GlobalVariable *> ModuleScopeVariables = AMDGPU::findVariablesToLower(M, nullptr); - Changed |= processUsedLDS(CG, M, ModuleScopeVariables); + if (!ModuleScopeVariables.empty()) { + std::string VarName = "llvm.amdgcn.module.lds"; + + GlobalVariable *SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, ModuleScopeVariables); + + appendToCompilerUsed( + M, {static_cast<GlobalValue *>( + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + removeFromUsedLists(M, ModuleScopeVariables); + replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV, + LDSVarsToConstantGEP, + [](Use &) { return true; }); + + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. + + IRBuilder<> Builder(Ctx); + for (Function &Func : M.functions()) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. + Func.addFnAttr("amdgpu-elide-module-lds"); + } + } + } + + Changed = true; + } + // Move variables used by kernels into per-kernel instances for (Function &F : M.functions()) { if (F.isDeclaration()) continue; @@ -159,9 +212,37 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; + std::vector<GlobalVariable *> KernelUsedVariables = AMDGPU::findVariablesToLower(M, &F); - Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F); + + // Replace all constant uses with instructions if they belong to the + // current kernel. Unnecessary, removing will cause test churn. + for (size_t I = 0; I < KernelUsedVariables.size(); I++) { + GlobalVariable *GV = KernelUsedVariables[I]; + for (User *U : make_early_inc_range(GV->users())) { + if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) + AMDGPU::replaceConstantUsesInFunction(C, &F); + } + GV->removeDeadConstantUsers(); + } + + if (!KernelUsedVariables.empty()) { + std::string VarName = + (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str(); + GlobalVariable *SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, KernelUsedVariables); + + removeFromUsedLists(M, KernelUsedVariables); + replaceLDSVariablesWithStruct( + M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + return I && I->getFunction() == &F; + }); + Changed = true; + } } return Changed; @@ -212,16 +293,18 @@ private: return Changed; } - bool processUsedLDS(CallGraph const &CG, Module &M, - std::vector<GlobalVariable *> const &LDSVarsToTransform, - Function *F = nullptr) { + std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>> + createLDSVariableReplacement( + Module &M, std::string VarName, + std::vector<GlobalVariable *> const &LDSVarsToTransform) { + // Create a struct instance containing LDSVarsToTransform and map from those + // variables to ConstantExprGEP + // Variables may be introduced to meet alignment requirements. No aliasing + // metadata is useful for these as they have no uses. Erased before return. + LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); - - if (LDSVarsToTransform.empty()) { - // No variables to rewrite, no changes made. - return false; - } + assert(!LDSVarsToTransform.empty()); SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(LDSVarsToTransform.size()); @@ -234,9 +317,10 @@ private: performOptimizedStructLayout(LayoutFields); std::vector<GlobalVariable *> LocalVars; + BitVector IsPaddingField; LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large + IsPaddingField.reserve(LDSVarsToTransform.size()); { - // This usually won't need to insert any padding, perhaps avoid the alloc uint64_t CurrentOffset = 0; for (size_t I = 0; I < LayoutFields.size(); I++) { GlobalVariable *FGV = static_cast<GlobalVariable *>( @@ -256,10 +340,12 @@ private: M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false)); + IsPaddingField.push_back(true); CurrentOffset += Padding; } LocalVars.push_back(FGV); + IsPaddingField.push_back(false); CurrentOffset += LayoutFields[I].Size; } } @@ -270,9 +356,6 @@ private: LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes), [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); - std::string VarName( - F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str() - : "llvm.amdgcn.module.lds"); StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); Align StructAlign = @@ -283,62 +366,65 @@ private: VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); SGV->setAlignment(StructAlign); - if (!F) { - appendToCompilerUsed( - M, {static_cast<GlobalValue *>( - ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + DenseMap<GlobalVariable *, Constant *> Map; + Type *I32 = Type::getInt32Ty(Ctx); + for (size_t I = 0; I < LocalVars.size(); I++) { + GlobalVariable *GV = LocalVars[I]; + Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; + Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true); + if (IsPaddingField[I]) { + assert(GV->use_empty()); + GV->eraseFromParent(); + } else { + Map[GV] = GEP; + } } + assert(Map.size() == LDSVarsToTransform.size()); + return {SGV, std::move(Map)}; + } - // The verifier rejects used lists containing an inttoptr of a constant - // so remove the variables from these lists before replaceAllUsesWith - removeFromUsedLists(M, LocalVars); + template <typename PredicateTy> + void replaceLDSVariablesWithStruct( + Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform, + GlobalVariable *SGV, + DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP, + PredicateTy Predicate) { + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); // Create alias.scope and their lists. Each field in the new structure // does not alias with all other fields. SmallVector<MDNode *> AliasScopes; SmallVector<Metadata *> NoAliasList; - if (LocalVars.size() > 1) { + const size_t NumberVars = LDSVarsToTransform.size(); + if (NumberVars > 1) { MDBuilder MDB(Ctx); - AliasScopes.reserve(LocalVars.size()); + AliasScopes.reserve(NumberVars); MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); - for (size_t I = 0; I < LocalVars.size(); I++) { + for (size_t I = 0; I < NumberVars; I++) { MDNode *Scope = MDB.createAnonymousAliasScope(Domain); AliasScopes.push_back(Scope); } NoAliasList.append(&AliasScopes[1], AliasScopes.end()); } - // Replace uses of ith variable with a constantexpr to the ith field of the - // instance that will be allocated by AMDGPUMachineFunction - Type *I32 = Type::getInt32Ty(Ctx); - for (size_t I = 0; I < LocalVars.size(); I++) { - GlobalVariable *GV = LocalVars[I]; - Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; - Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx); - if (F) { - // Replace all constant uses with instructions if they belong to the - // current kernel. - for (User *U : make_early_inc_range(GV->users())) { - if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) - AMDGPU::replaceConstantUsesInFunction(C, F); - } - - GV->removeDeadConstantUsers(); + // Replace uses of ith variable with a constantexpr to the corresponding + // field of the instance that will be allocated by AMDGPUMachineFunction + for (size_t I = 0; I < NumberVars; I++) { + GlobalVariable *GV = LDSVarsToTransform[I]; + Constant *GEP = LDSVarsToConstantGEP[GV]; - GV->replaceUsesWithIf(GEP, [F](Use &U) { - Instruction *I = dyn_cast<Instruction>(U.getUser()); - return I && I->getFunction() == F; - }); - } else { - GV->replaceAllUsesWith(GEP); - } + GV->replaceUsesWithIf(GEP, Predicate); if (GV->use_empty()) { GV->eraseFromParent(); } - uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I); - Align A = commonAlignment(StructAlign, Off); + APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0); + GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff); + uint64_t Offset = APOff.getZExtValue(); + + Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset); if (I) NoAliasList[I - 1] = AliasScopes[I - 1]; @@ -349,32 +435,6 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - - // This ensures the variable is allocated when called functions access it. - // It also lets other passes, specifically PromoteAlloca, accurately - // calculate how much LDS will be used by the kernel after lowering. - if (!F) { - IRBuilder<> Builder(Ctx); - for (Function &Func : M.functions()) { - if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - const CallGraphNode *N = CG[&Func]; - const bool CalleesRequireModuleLDS = N->size() > 0; - - if (CalleesRequireModuleLDS) { - // If a function this kernel might call requires module LDS, - // annotate the kernel to let later passes know it will allocate - // this structure, even if not apparent from the IR. - markUsedByKernel(Builder, &Func, SGV); - } else { - // However if we are certain this kernel cannot call a function that - // requires module LDS, annotate the kernel so the backend can elide - // the allocation without repeating callgraph walks. - Func.addFnAttr("amdgpu-elide-module-lds"); - } - } - } - } - return true; } void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 753f7edc9385..98b5031071cf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -29,7 +29,7 @@ public: virtual ~AMDGPUMIRFormatter() = default; /// Implement target specific parsing of target custom pseudo source value. - virtual bool + bool parseCustomPseudoSourceValue(StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index bfe2e9b66ed4..98e9907068f2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -191,8 +191,8 @@ public: report_fatal_error("Invalid rule identifier"); } - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; }; bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 04da14cc4916..859deae86f35 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -9,6 +9,18 @@ /// \file /// This contains a MachineSchedStrategy implementation for maximizing wave /// occupancy on GCN hardware. +/// +/// This pass will apply multiple scheduling stages to the same function. +/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual +/// entry point for the scheduling of those regions is +/// GCNScheduleDAGMILive::runSchedStages. + +/// Generally, the reason for having multiple scheduling stages is to account +/// for the kernel-wide effect of register usage on occupancy. Usually, only a +/// few scheduling regions will have register pressure high enough to limit +/// occupancy for the kernel, so constraints can be relaxed to improve ILP in +/// other regions. +/// //===----------------------------------------------------------------------===// #include "GCNSchedStrategy.h" @@ -20,9 +32,9 @@ using namespace llvm; GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( - const MachineSchedContext *C) : - GenericScheduler(C), TargetOccupancy(0), HasClusteredNodes(false), - HasExcessPressure(false), MF(nullptr) { } + const MachineSchedContext *C) + : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), + HasClusteredNodes(false), HasExcessPressure(false) {} void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -302,210 +314,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { return SU; } -GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S) : - ScheduleDAGMILive(C, std::move(S)), - ST(MF.getSubtarget<GCNSubtarget>()), - MFI(*MF.getInfo<SIMachineFunctionInfo>()), - StartingOccupancy(MFI.getOccupancy()), - MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) { +GCNScheduleDAGMILive::GCNScheduleDAGMILive( + MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) + : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), + MFI(*MF.getInfo<SIMachineFunctionInfo>()), + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { - if (Stage == Collect) { - // Just record regions at the first pass. - Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); - return; - } - - std::vector<MachineInstr*> Unsched; - Unsched.reserve(NumRegionInstrs); - for (auto &I : *this) { - Unsched.push_back(&I); - } - - GCNRegPressure PressureBefore; - if (LIS) { - PressureBefore = Pressure[RegionIdx]; - - LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; - GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); - dbgs() << "Region live-in pressure: "; - llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); - dbgs() << "Region register pressure: "; - PressureBefore.print(dbgs())); - } - - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - // Set HasClusteredNodes to true for late stages where we have already - // collected it. That way pickNode() will not scan SDep's when not needed. - S.HasClusteredNodes = Stage > InitialSchedule; - S.HasExcessPressure = false; - ScheduleDAGMILive::schedule(); - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); - RescheduleRegions[RegionIdx] = false; - if (Stage == InitialSchedule && S.HasClusteredNodes) - RegionsWithClusters[RegionIdx] = true; - if (S.HasExcessPressure) - RegionsWithHighRP[RegionIdx] = true; - - if (!LIS) - return; - - // Check the results of scheduling. - auto PressureAfter = getRealRegPressure(); - - LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; - PressureAfter.print(dbgs())); - - if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { - Pressure[RegionIdx] = PressureAfter; - RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST) == MinOccupancy; - - LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); - return; - } - - unsigned WavesAfter = - std::min(S.TargetOccupancy, PressureAfter.getOccupancy(ST)); - unsigned WavesBefore = - std::min(S.TargetOccupancy, PressureBefore.getOccupancy(ST)); - LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore - << ", after " << WavesAfter << ".\n"); - - // We may not be able to keep the current target occupancy because of the just - // scheduled region. We might still be able to revert scheduling if the - // occupancy before was higher, or if the current schedule has register - // pressure higher than the excess limits which could lead to more spilling. - unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); - - // Allow memory bound functions to drop to 4 waves if not limited by an - // attribute. - if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && - WavesAfter >= MFI.getMinAllowedOccupancy()) { - LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " - << MFI.getMinAllowedOccupancy() << " waves\n"); - NewOccupancy = WavesAfter; - } - - if (NewOccupancy < MinOccupancy) { - MinOccupancy = NewOccupancy; - MFI.limitOccupancy(MinOccupancy); - RegionsWithMinOcc.reset(); - LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " - << MinOccupancy << ".\n"); - } - - unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum(false) > MaxVGPRs || - PressureAfter.getAGPRNum() > MaxVGPRs || - PressureAfter.getSGPRNum() > MaxSGPRs) { - RescheduleRegions[RegionIdx] = true; - RegionsWithHighRP[RegionIdx] = true; - } - - // If this condition is true, then either the occupancy before and after - // scheduling is the same, or we are allowing the occupancy to drop because - // the function is memory bound. Even if we are OK with the current occupancy, - // we still need to verify that we will not introduce any extra chance of - // spilling. - if (WavesAfter >= MinOccupancy) { - if (Stage == UnclusteredReschedule && - !PressureAfter.less(ST, PressureBefore)) { - LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); - } else if (WavesAfter > MFI.getMinWavesPerEU() || - PressureAfter.less(ST, PressureBefore) || - !RescheduleRegions[RegionIdx]) { - Pressure[RegionIdx] = PressureAfter; - RegionsWithMinOcc[RegionIdx] = - PressureAfter.getOccupancy(ST) == MinOccupancy; - if (!RegionsWithClusters[RegionIdx] && - (Stage + 1) == UnclusteredReschedule) - RescheduleRegions[RegionIdx] = false; - return; - } else { - LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); - } - } - - RegionsWithMinOcc[RegionIdx] = - PressureBefore.getOccupancy(ST) == MinOccupancy; - LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); - RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] || - (Stage + 1) != UnclusteredReschedule; - RegionEnd = RegionBegin; - int SkippedDebugInstr = 0; - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) { - ++SkippedDebugInstr; - continue; - } - - if (MI->getIterator() != RegionEnd) { - BB->remove(MI); - BB->insert(RegionEnd, MI); - if (!MI->isDebugInstr()) - LIS->handleMove(*MI, true); - } - // Reset read-undef flags and update them later. - for (auto &Op : MI->operands()) - if (Op.isReg() && Op.isDef()) - Op.setIsUndef(false); - RegisterOperands RegOpers; - RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); - if (!MI->isDebugInstr()) { - if (ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *LIS); - } - } - RegionEnd = MI->getIterator(); - ++RegionEnd; - LLVM_DEBUG(dbgs() << "Scheduling " << *MI); - } - - // After reverting schedule, debug instrs will now be at the end of the block - // and RegionEnd will point to the first debug instr. Increment RegionEnd - // pass debug instrs to the actual end of the scheduling region. - while (SkippedDebugInstr-- > 0) - ++RegionEnd; - - // If Unsched.front() instruction is a debug instruction, this will actually - // shrink the region since we moved all debug instructions to the end of the - // block. Find the first instruction that is not a debug instruction. - RegionBegin = Unsched.front()->getIterator(); - if (RegionBegin->isDebugInstr()) { - for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) - continue; - RegionBegin = MI->getIterator(); - break; - } - } - - // Then move the debug instructions back into their correct place and set - // RegionBegin and RegionEnd if needed. - placeDebugValues(); - - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + // Collect all scheduling regions. The actual scheduling is performed in + // GCNScheduleDAGMILive::finalizeSchedule. + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); } -GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { +GCNRegPressure +GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } -void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { +void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, + const MachineBasicBlock *MBB) { GCNDownwardRPTracker RPTracker(*LIS); // If the block has the only successor then live-ins of that successor are @@ -542,7 +374,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { RPTracker.reset(*I, &LRS); } - for ( ; ; ) { + for (;;) { I = RPTracker.getNext(); if (Regions[CurRegion].first == I || NonDbgMI == I) { @@ -588,8 +420,9 @@ GCNScheduleDAGMILive::getBBLiveInMap() const { } void GCNScheduleDAGMILive::finalizeSchedule() { - LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - + // Start actual scheduling here. This function is called by the base + // MachineScheduler after all regions have been recorded by + // GCNScheduleDAGMILive::schedule(). LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); RescheduleRegions.resize(Regions.size()); @@ -601,142 +434,470 @@ void GCNScheduleDAGMILive::finalizeSchedule() { RegionsWithHighRP.reset(); RegionsWithMinOcc.reset(); + runSchedStages(); +} + +void GCNScheduleDAGMILive::runSchedStages() { + LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); + InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this); + UnclusteredRescheduleStage S1(GCNSchedStageID::UnclusteredReschedule, *this); + ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule, + *this); + PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this); + GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3}; + if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); - std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + for (auto *Stage : SchedStages) { + if (!Stage->initGCNSchedStage()) + continue; - do { - Stage++; - RegionIdx = 0; - MachineBasicBlock *MBB = nullptr; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; + // Setup for scheduling the region and check whether it should be skipped. + if (!Stage->initGCNRegion()) { + Stage->advanceRegion(); + exitRegion(); + continue; + } - if (Stage > InitialSchedule) { - if (!LIS) - break; + ScheduleDAGMILive::schedule(); + Stage->finalizeGCNRegion(); + } - // Retry function scheduling if we found resulting occupancy and it is - // lower than used for first pass scheduling. This will give more freedom - // to schedule low register pressure blocks. - // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + Stage->finalizeGCNSchedStage(); + } +} - if (Stage == UnclusteredReschedule) { - if (RescheduleRegions.none()) - continue; - LLVM_DEBUG(dbgs() << - "Retrying function scheduling without clustering.\n"); - } +#ifndef NDEBUG +raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { + switch (StageID) { + case GCNSchedStageID::InitialSchedule: + OS << "Initial Schedule"; + break; + case GCNSchedStageID::UnclusteredReschedule: + OS << "Unclustered Reschedule"; + break; + case GCNSchedStageID::ClusteredLowOccupancyReschedule: + OS << "Clustered Low Occupancy Reschedule"; + break; + case GCNSchedStageID::PreRARematerialize: + OS << "Pre-RA Rematerialize"; + break; + } + return OS; +} +#endif - if (Stage == ClusteredLowOccupancyReschedule) { - if (StartingOccupancy <= MinOccupancy) - break; +GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)), + MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} - LLVM_DEBUG( - dbgs() - << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); - } +bool GCNSchedStage::initGCNSchedStage() { + if (!DAG.LIS) + return false; - if (Stage == PreRARematerialize) { - if (RegionsWithMinOcc.none() || Regions.size() == 1) - break; + LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); + return true; +} - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // Check maximum occupancy - if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == - MinOccupancy) - break; +bool UnclusteredRescheduleStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - // FIXME: This pass will invalidate cached MBBLiveIns for regions - // inbetween the defs and region we sinked the def to. Cached pressure - // for regions where a def is sinked from will also be invalidated. Will - // need to be fixed if there is another pass after this pass. - static_assert(LastStage == PreRARematerialize, - "Passes after PreRARematerialize are not supported"); + if (DAG.RescheduleRegions.none()) + return false; - collectRematerializableInstructions(); - if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) - break; + SavedMutations.swap(DAG.Mutations); - LLVM_DEBUG( - dbgs() << "Retrying function scheduling with improved occupancy of " - << MinOccupancy << " from rematerializing\n"); - } - } + LLVM_DEBUG(dbgs() << "Retrying function scheduling without clustering.\n"); + return true; +} - if (Stage == UnclusteredReschedule) - SavedMutations.swap(Mutations); +bool ClusteredLowOccStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - for (auto Region : Regions) { - if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) && - !RescheduleRegions[RegionIdx]) || - (Stage == ClusteredLowOccupancyReschedule && - !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) { + // Don't bother trying to improve ILP in lower RP regions if occupancy has not + // been dropped. All regions will have already been scheduled with the ideal + // occupancy targets. + if (DAG.StartingOccupancy <= DAG.MinOccupancy) + return false; - ++RegionIdx; - continue; - } + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with lowest recorded occupancy " + << DAG.MinOccupancy << ".\n"); + return true; +} - RegionBegin = Region.first; - RegionEnd = Region.second; +bool PreRARematStage::initGCNSchedStage() { + if (!GCNSchedStage::initGCNSchedStage()) + return false; - if (RegionBegin->getParent() != MBB) { - if (MBB) finishBlock(); - MBB = RegionBegin->getParent(); - startBlock(MBB); - if (Stage == InitialSchedule) - computeBlockPressure(MBB); - } + if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1) + return false; - unsigned NumRegionInstrs = std::distance(begin(), end()); - enterRegion(MBB, begin(), end(), NumRegionInstrs); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // Check maximum occupancy + if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == + DAG.MinOccupancy) + return false; - // Skip empty scheduling regions (0 or 1 schedulable instructions). - if (begin() == end() || begin() == std::prev(end())) { - exitRegion(); - ++RegionIdx; - continue; - } + // FIXME: This pass will invalidate cached MBBLiveIns for regions + // inbetween the defs and region we sinked the def to. Cached pressure + // for regions where a def is sinked from will also be invalidated. Will + // need to be fixed if there is another pass after this pass. - LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); - LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " - << MBB->getName() << "\n From: " << *begin() - << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + collectRematerializableInstructions(); + if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) + return false; - schedule(); + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with improved occupancy of " + << DAG.MinOccupancy << " from rematerializing\n"); + return true; +} + +void GCNSchedStage::finalizeGCNSchedStage() { + DAG.finishBlock(); + LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); +} + +void UnclusteredRescheduleStage::finalizeGCNSchedStage() { + SavedMutations.swap(DAG.Mutations); + + GCNSchedStage::finalizeGCNSchedStage(); +} + +bool GCNSchedStage::initGCNRegion() { + // Check whether this new region is also a new block. + if (DAG.RegionBegin->getParent() != CurrentMBB) + setupNewBlock(); + + unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); + DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); + + // Skip empty scheduling regions (0 or 1 schedulable instructions). + if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end())) + return false; + + LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); + LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) + << " " << CurrentMBB->getName() + << "\n From: " << *DAG.begin() << " To: "; + if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + + // Save original instruction order before scheduling for possible revert. + Unsched.clear(); + Unsched.reserve(DAG.NumRegionInstrs); + for (auto &I : DAG) + Unsched.push_back(&I); + + PressureBefore = DAG.Pressure[RegionIdx]; + + LLVM_DEBUG( + dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), DAG.LiveIns[RegionIdx], DAG.MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; PressureBefore.print(dbgs())); + + // Set HasClusteredNodes to true for late stages where we have already + // collected it. That way pickNode() will not scan SDep's when not needed. + S.HasClusteredNodes = StageID > GCNSchedStageID::InitialSchedule; + S.HasExcessPressure = false; + + return true; +} + +bool UnclusteredRescheduleStage::initGCNRegion() { + if (!DAG.RescheduleRegions[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +bool ClusteredLowOccStage::initGCNRegion() { + // We may need to reschedule this region if it doesn't have clusters so it + // wasn't rescheduled in the last stage, or if we found it was testing + // critical register pressure limits in the unclustered reschedule stage. The + // later is because we may not have been able to raise the min occupancy in + // the previous stage so the region may be overly constrained even if it was + // already rescheduled. + if (!DAG.RegionsWithClusters[RegionIdx] && !DAG.RegionsWithHighRP[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +bool PreRARematStage::initGCNRegion() { + if (!DAG.RescheduleRegions[RegionIdx]) + return false; + + return GCNSchedStage::initGCNRegion(); +} + +void GCNSchedStage::setupNewBlock() { + if (CurrentMBB) + DAG.finishBlock(); + + CurrentMBB = DAG.RegionBegin->getParent(); + DAG.startBlock(CurrentMBB); + // Get real RP for the region if it hasn't be calculated before. After the + // initial schedule stage real RP will be collected after scheduling. + if (StageID == GCNSchedStageID::InitialSchedule) + DAG.computeBlockPressure(RegionIdx, CurrentMBB); +} + +void GCNSchedStage::finalizeGCNRegion() { + DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); + DAG.RescheduleRegions[RegionIdx] = false; + if (S.HasExcessPressure) + DAG.RegionsWithHighRP[RegionIdx] = true; + + // Revert scheduling if we have dropped occupancy or there is some other + // reason that the original schedule is better. + checkScheduling(); + + DAG.exitRegion(); + RegionIdx++; +} + +void InitialScheduleStage::finalizeGCNRegion() { + // Record which regions have clustered nodes for the next unclustered + // reschedule stage. + assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); + if (S.HasClusteredNodes) + DAG.RegionsWithClusters[RegionIdx] = true; + + GCNSchedStage::finalizeGCNRegion(); +} + +void GCNSchedStage::checkScheduling() { + // Check the results of scheduling. + PressureAfter = DAG.getRealRegPressure(RegionIdx); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; + PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { + DAG.Pressure[RegionIdx] = PressureAfter; + DAG.RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; + + // Early out if we have achieve the occupancy target. + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + return; + } + + unsigned WavesAfter = + std::min(S.getTargetOccupancy(), PressureAfter.getOccupancy(ST)); + unsigned WavesBefore = + std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); + LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore + << ", after " << WavesAfter << ".\n"); + + // We may not be able to keep the current target occupancy because of the just + // scheduled region. We might still be able to revert scheduling if the + // occupancy before was higher, or if the current schedule has register + // pressure higher than the excess limits which could lead to more spilling. + unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + + // Allow memory bound functions to drop to 4 waves if not limited by an + // attribute. + if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && + WavesAfter >= MFI.getMinAllowedOccupancy()) { + LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " + << MFI.getMinAllowedOccupancy() << " waves\n"); + NewOccupancy = WavesAfter; + } + + if (NewOccupancy < DAG.MinOccupancy) { + DAG.MinOccupancy = NewOccupancy; + MFI.limitOccupancy(DAG.MinOccupancy); + DAG.RegionsWithMinOcc.reset(); + LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " + << DAG.MinOccupancy << ".\n"); + } - exitRegion(); - ++RegionIdx; + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + if (PressureAfter.getVGPRNum(false) > MaxVGPRs || + PressureAfter.getAGPRNum() > MaxVGPRs || + PressureAfter.getSGPRNum() > MaxSGPRs) { + DAG.RescheduleRegions[RegionIdx] = true; + DAG.RegionsWithHighRP[RegionIdx] = true; + } + + // Revert if this region's schedule would cause a drop in occupancy or + // spilling. + if (shouldRevertScheduling(WavesAfter)) { + revertScheduling(); + } else { + DAG.Pressure[RegionIdx] = PressureAfter; + DAG.RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; + } +} + +bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { + if (WavesAfter < DAG.MinOccupancy) + return true; + + return false; +} + +bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + assert(nextStage(StageID) == GCNSchedStageID::UnclusteredReschedule); + // Don't reschedule the region in the next stage if it doesn't have clusters. + if (!DAG.RegionsWithClusters[RegionIdx]) + DAG.RescheduleRegions[RegionIdx] = false; + + return false; +} + +bool UnclusteredRescheduleStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + // If RP is not reduced in the unclustred reschedule stage, revert to the old + // schedule. + if (!PressureAfter.less(ST, PressureBefore)) { + LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); + return true; + } + + return false; +} + +bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + return false; +} + +bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { + if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) + return true; + + if (mayCauseSpilling(WavesAfter)) + return true; + + return false; +} + +bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { + if (WavesAfter <= MFI.getMinWavesPerEU() && + !PressureAfter.less(ST, PressureBefore) && + DAG.RescheduleRegions[RegionIdx]) { + LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); + return true; + } + + return false; +} + +void GCNSchedStage::revertScheduling() { + DAG.RegionsWithMinOcc[RegionIdx] = + PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; + LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + DAG.RescheduleRegions[RegionIdx] = + DAG.RegionsWithClusters[RegionIdx] || + (nextStage(StageID)) != GCNSchedStageID::UnclusteredReschedule; + DAG.RegionEnd = DAG.RegionBegin; + int SkippedDebugInstr = 0; + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) { + ++SkippedDebugInstr; + continue; + } + + if (MI->getIterator() != DAG.RegionEnd) { + DAG.BB->remove(MI); + DAG.BB->insert(DAG.RegionEnd, MI); + if (!MI->isDebugInstr()) + DAG.LIS->handleMove(*MI, true); + } + + // Reset read-undef flags and update them later. + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef()) + Op.setIsUndef(false); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); + if (!MI->isDebugInstr()) { + if (DAG.ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *DAG.LIS); + } } - finishBlock(); + DAG.RegionEnd = MI->getIterator(); + ++DAG.RegionEnd; + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); + } + + // After reverting schedule, debug instrs will now be at the end of the block + // and RegionEnd will point to the first debug instr. Increment RegionEnd + // pass debug instrs to the actual end of the scheduling region. + while (SkippedDebugInstr-- > 0) + ++DAG.RegionEnd; + + // If Unsched.front() instruction is a debug instruction, this will actually + // shrink the region since we moved all debug instructions to the end of the + // block. Find the first instruction that is not a debug instruction. + DAG.RegionBegin = Unsched.front()->getIterator(); + if (DAG.RegionBegin->isDebugInstr()) { + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) + continue; + DAG.RegionBegin = MI->getIterator(); + break; + } + } + + // Then move the debug instructions back into their correct place and set + // RegionBegin and RegionEnd if needed. + DAG.placeDebugValues(); - if (Stage == UnclusteredReschedule) - SavedMutations.swap(Mutations); - } while (Stage != LastStage); + DAG.Regions[RegionIdx] = std::make_pair(DAG.RegionBegin, DAG.RegionEnd); } -void GCNScheduleDAGMILive::collectRematerializableInstructions() { - const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { +void PreRARematStage::collectRematerializableInstructions() { + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); + for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); - if (!LIS->hasInterval(Reg)) + if (!DAG.LIS->hasInterval(Reg)) continue; // TODO: Handle AGPR and SGPR rematerialization - if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) || - !MRI.hasOneNonDBGUse(Reg)) + if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || + !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg)) continue; - MachineOperand *Op = MRI.getOneDef(Reg); + MachineOperand *Op = DAG.MRI.getOneDef(Reg); MachineInstr *Def = Op->getParent(); if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) continue; - MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg); + MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg); if (Def->getParent() == UseI->getParent()) continue; @@ -744,10 +905,10 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() { // live-through or used inside regions at MinOccupancy. This means that the // register must be in the live-in set for the region. bool AddedToRematList = false; - for (unsigned I = 0, E = Regions.size(); I != E; ++I) { - auto It = LiveIns[I].find(Reg); - if (It != LiveIns[I].end() && !It->second.none()) { - if (RegionsWithMinOcc[I]) { + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + auto It = DAG.LiveIns[I].find(Reg); + if (It != DAG.LiveIns[I].end() && !It->second.none()) { + if (DAG.RegionsWithMinOcc[I]) { RematerializableInsts[I][Def] = UseI; AddedToRematList = true; } @@ -762,8 +923,8 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() { } } -bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, - const TargetInstrInfo *TII) { +bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII) { // Temporary copies of cached variables we will be modifying and replacing if // sinking succeeds. SmallVector< @@ -772,9 +933,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; DenseMap<unsigned, GCNRegPressure> NewPressure; BitVector NewRescheduleRegions; + LiveIntervals *LIS = DAG.LIS; - NewRegions.resize(Regions.size()); - NewRescheduleRegions.resize(Regions.size()); + NewRegions.resize(DAG.Regions.size()); + NewRescheduleRegions.resize(DAG.Regions.size()); // Collect only regions that has a rematerializable def as a live-in. SmallSet<unsigned, 16> ImpactedRegions; @@ -784,16 +946,16 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Make copies of register pressure and live-ins cache that will be updated // as we rematerialize. for (auto Idx : ImpactedRegions) { - NewPressure[Idx] = Pressure[Idx]; - NewLiveIns[Idx] = LiveIns[Idx]; + NewPressure[Idx] = DAG.Pressure[Idx]; + NewLiveIns[Idx] = DAG.LiveIns[Idx]; } - NewRegions = Regions; + NewRegions = DAG.Regions; NewRescheduleRegions.reset(); DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; bool Improved = false; for (auto I : ImpactedRegions) { - if (!RegionsWithMinOcc[I]) + if (!DAG.RegionsWithMinOcc[I]) continue; Improved = false; @@ -802,12 +964,12 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // TODO: Handle occupancy drop due to AGPR and SGPR. // Check if cause of occupancy drop is due to VGPR usage and not SGPR. - if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) + if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy) break; // The occupancy of this region could have been improved by a previous // iteration's sinking of defs. - if (NewPressure[I].getOccupancy(ST) > MinOccupancy) { + if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { NewRescheduleRegions[I] = true; Improved = true; continue; @@ -827,7 +989,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); // If in the most optimistic scenario, we cannot improve occupancy, then do // not attempt to sink any instructions. - if (OptimisticOccupancy <= MinOccupancy) + if (OptimisticOccupancy <= DAG.MinOccupancy) break; unsigned ImproveOccupancy = 0; @@ -842,7 +1004,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // call LiveRangeEdit::allUsesAvailableAt() and // LiveRangeEdit::canRematerializeAt(). TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - Def->getOperand(0).getSubReg(), *Def, *TRI); + Def->getOperand(0).getSubReg(), *Def, *DAG.TRI); MachineInstr *NewMI = &*(--InsertPos); LIS->InsertMachineInstrInMaps(*NewMI); LIS->removeInterval(Reg); @@ -851,11 +1013,11 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Update region boundaries in scheduling region we sinked from since we // may sink an instruction that was at the beginning or end of its region - updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, - /*Removing =*/true); + DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, + /*Removing =*/true); // Update region boundaries in region we sinked to. - updateRegionBoundaries(NewRegions, InsertPos, NewMI); + DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI); LaneBitmask PrevMask = NewLiveIns[I][Reg]; // FIXME: Also update cached pressure for where the def was sinked from. @@ -863,9 +1025,9 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // the reg from all regions as a live-in. for (auto Idx : RematDefToLiveInRegions[Def]) { NewLiveIns[Idx].erase(Reg); - if (InsertPos->getParent() != Regions[Idx].first->getParent()) { + if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { // Def is live-through and not used in this block. - NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI); + NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI); } else { // Def is used and rematerialized into this block. GCNDownwardRPTracker RPT(*LIS); @@ -879,7 +1041,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, SinkedDefs.push_back(Def); ImproveOccupancy = NewPressure[I].getOccupancy(ST); - if (ImproveOccupancy > MinOccupancy) + if (ImproveOccupancy > DAG.MinOccupancy) break; } @@ -888,7 +1050,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, for (auto TrackedIdx : RematDefToLiveInRegions[Def]) RematerializableInsts[TrackedIdx].erase(Def); - if (ImproveOccupancy <= MinOccupancy) + if (ImproveOccupancy <= DAG.MinOccupancy) break; NewRescheduleRegions[I] = true; @@ -917,7 +1079,7 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, MachineInstr *OldMI = Entry.second; // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. - BBLiveInMap.erase(OldMI); + DAG.BBLiveInMap.erase(OldMI); // Remove OldMI and update LIS Register Reg = MI->getOperand(0).getReg(); @@ -929,22 +1091,22 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, // Update live-ins, register pressure, and regions caches. for (auto Idx : ImpactedRegions) { - LiveIns[Idx] = NewLiveIns[Idx]; - Pressure[Idx] = NewPressure[Idx]; - MBBLiveIns.erase(Regions[Idx].first->getParent()); + DAG.LiveIns[Idx] = NewLiveIns[Idx]; + DAG.Pressure[Idx] = NewPressure[Idx]; + DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent()); } - Regions = NewRegions; - RescheduleRegions = NewRescheduleRegions; + DAG.Regions = NewRegions; + DAG.RescheduleRegions = NewRescheduleRegions; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - MFI.increaseOccupancy(MF, ++MinOccupancy); + MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); return true; } // Copied from MachineLICM -bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) { - if (!TII->isTriviallyReMaterializable(MI)) +bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { + if (!DAG.TII->isTriviallyReMaterializable(MI)) return false; for (const MachineOperand &MO : MI.operands()) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index c3db849cf81a..7aadf89e0bf7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -28,8 +28,6 @@ class GCNSubtarget; /// heuristics to determine excess/critical pressure sets. Its goal is to /// maximize kernel occupancy (i.e. maximum number of waves per simd). class GCNMaxOccupancySchedStrategy final : public GenericScheduler { - friend class GCNScheduleDAGMILive; - SUnit *pickNodeBidirectional(bool &IsTopNode); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, @@ -42,15 +40,18 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { unsigned SGPRPressure, unsigned VGPRPressure); std::vector<unsigned> Pressure; + std::vector<unsigned> MaxPressure; unsigned SGPRExcessLimit; + unsigned VGPRExcessLimit; - unsigned SGPRCriticalLimit; - unsigned VGPRCriticalLimit; unsigned TargetOccupancy; + MachineFunction *MF; + +public: // schedule() have seen a clustered memory operation. Set it to false // before a region scheduling to know if the region had such clusters. bool HasClusteredNodes; @@ -59,28 +60,53 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler { // register pressure for actual scheduling heuristics. bool HasExcessPressure; - MachineFunction *MF; + unsigned SGPRCriticalLimit; + + unsigned VGPRCriticalLimit; -public: GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); SUnit *pickNode(bool &IsTopNode) override; void initialize(ScheduleDAGMI *DAG) override; + unsigned getTargetOccupancy() { return TargetOccupancy; } + void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; } }; -class GCNScheduleDAGMILive final : public ScheduleDAGMILive { +enum class GCNSchedStageID : unsigned { + InitialSchedule = 0, + UnclusteredReschedule = 1, + ClusteredLowOccupancyReschedule = 2, + PreRARematerialize = 3, + LastStage = PreRARematerialize +}; + +#ifndef NDEBUG +raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID); +#endif + +inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) { + assert(Stage != GCNSchedStageID::PreRARematerialize); + Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1); + return Stage; +} + +inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) { + return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1); +} - enum : unsigned { - Collect, - InitialSchedule, - UnclusteredReschedule, - ClusteredLowOccupancyReschedule, - PreRARematerialize, - LastStage = PreRARematerialize - }; +inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) { + return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS); +} + +class GCNScheduleDAGMILive final : public ScheduleDAGMILive { + friend class GCNSchedStage; + friend class InitialScheduleStage; + friend class UnclusteredRescheduleStage; + friend class ClusteredLowOccStage; + friend class PreRARematStage; const GCNSubtarget &ST; @@ -92,12 +118,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Minimal real occupancy recorder for the function. unsigned MinOccupancy; - // Scheduling stage number. - unsigned Stage; - - // Current region index. - size_t RegionIdx; - // Vector of regions recorder for later rescheduling SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> Regions; @@ -121,6 +141,148 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Region pressure cache. SmallVector<GCNRegPressure, 32> Pressure; + // Temporary basic block live-in cache. + DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns; + + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; + + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; + + // Return current region pressure. + GCNRegPressure getRealRegPressure(unsigned RegionIdx) const; + + // Compute and cache live-ins and pressure for all regions in block. + void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB); + + // Update region boundaries when removing MI or inserting NewMI before MI. + void updateRegionBoundaries( + SmallVectorImpl<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, + bool Removing = false); + + void runSchedStages(); + +public: + GCNScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr<MachineSchedStrategy> S); + + void schedule() override; + + void finalizeSchedule() override; +}; + +// GCNSchedStrategy applies multiple scheduling stages to a function. +class GCNSchedStage { +protected: + GCNScheduleDAGMILive &DAG; + + GCNMaxOccupancySchedStrategy &S; + + MachineFunction &MF; + + SIMachineFunctionInfo &MFI; + + const GCNSubtarget &ST; + + const GCNSchedStageID StageID; + + // The current block being scheduled. + MachineBasicBlock *CurrentMBB = nullptr; + + // Current region index. + unsigned RegionIdx = 0; + + // Record the original order of instructions before scheduling. + std::vector<MachineInstr *> Unsched; + + // RP before scheduling the current region. + GCNRegPressure PressureBefore; + + // RP after scheduling the current region. + GCNRegPressure PressureAfter; + + GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG); + +public: + // Initialize state for a scheduling stage. Returns false if the current stage + // should be skipped. + virtual bool initGCNSchedStage(); + + // Finalize state after finishing a scheduling pass on the function. + virtual void finalizeGCNSchedStage(); + + // Setup for scheduling a region. Returns false if the current region should + // be skipped. + virtual bool initGCNRegion(); + + // Track whether a new region is also a new MBB. + void setupNewBlock(); + + // Finalize state after scheudling a region. + virtual void finalizeGCNRegion(); + + // Check result of scheduling. + void checkScheduling(); + + // Returns true if scheduling should be reverted. + virtual bool shouldRevertScheduling(unsigned WavesAfter); + + // Returns true if the new schedule may result in more spilling. + bool mayCauseSpilling(unsigned WavesAfter); + + // Attempt to revert scheduling for this region. + void revertScheduling(); + + void advanceRegion() { RegionIdx++; } + + virtual ~GCNSchedStage() = default; +}; + +class InitialScheduleStage : public GCNSchedStage { +public: + void finalizeGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +class UnclusteredRescheduleStage : public GCNSchedStage { +private: + std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + +public: + bool initGCNSchedStage() override; + + void finalizeGCNSchedStage() override; + + bool initGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + UnclusteredRescheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +// Retry function scheduling if we found resulting occupancy and it is +// lower than used for other scheduling passes. This will give more freedom +// to schedule low register pressure blocks. +class ClusteredLowOccStage : public GCNSchedStage { +public: + bool initGCNSchedStage() override; + + bool initGCNRegion() override; + + bool shouldRevertScheduling(unsigned WavesAfter) override; + + ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} +}; + +class PreRARematStage : public GCNSchedStage { +private: // Each region at MinOccupancy will have their own list of trivially // rematerializable instructions we can remat to reduce RP. The list maps an // instruction to the position we should remat before, usually the MI using @@ -132,12 +294,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // that has the defined reg as a live-in. DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions; - // Temporary basic block live-in cache. - DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns; - - DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; - DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; - // Collect all trivially rematerializable VGPR instructions with a single def // and single use outside the defining block into RematerializableInsts. void collectRematerializableInstructions(); @@ -150,26 +306,15 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { bool sinkTriviallyRematInsts(const GCNSubtarget &ST, const TargetInstrInfo *TII); - // Return current region pressure. - GCNRegPressure getRealRegPressure() const; - - // Compute and cache live-ins and pressure for all regions in block. - void computeBlockPressure(const MachineBasicBlock *MBB); - - // Update region boundaries when removing MI or inserting NewMI before MI. - void updateRegionBoundaries( - SmallVectorImpl<std::pair<MachineBasicBlock::iterator, - MachineBasicBlock::iterator>> &RegionBoundaries, - MachineBasicBlock::iterator MI, MachineInstr *NewMI, - bool Removing = false); - public: - GCNScheduleDAGMILive(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S); + bool initGCNSchedStage() override; - void schedule() override; + bool initGCNRegion() override; - void finalizeSchedule() override; + bool shouldRevertScheduling(unsigned WavesAfter) override; + + PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) + : GCNSchedStage(StageID, DAG) {} }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e093d78b2cc6..d9d7d4efa8c3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -309,6 +309,11 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; } +static bool isVCMPX64(const MCInstrDesc &Desc) { + return (Desc.TSFlags & SIInstrFlags::VOP3) && + Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); +} + void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -326,6 +331,17 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Encoding |= getImplicitOpSelHiEncoding(Opcode); } + // GFX11 v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. + // Documentation requires dst to be encoded as EXEC (0x7E), + // but it looks like the actual value encoded for dst operand + // is ignored by HW. It was decided to define dst as "do not care" + // in td files to allow disassembler accept any dst value. + // However, dst is encoded as EXEC for compatibility with SP3. + if (AMDGPU::isGFX11Plus(STI) && isVCMPX64(Desc)) { + assert((Encoding & 0xFF) == 0); + Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO); + } + for (unsigned i = 0; i < bytes; i++) { OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index e7706fa0ef5c..1ed79add64c9 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -54,8 +54,8 @@ public: MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *IsFast = nullptr) const override; - virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT, - bool LegalOperations) const override { + bool canCombineTruncStore(EVT ValVT, EVT MemVT, + bool LegalOperations) const override { // R600 has "custom" lowering for truncating stores despite not supporting // those instructions. If we allow that custom lowering in the DAG combiner // then all truncates are merged into truncating stores, giving worse code diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 438e8b200ecc..f7d139adc63b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2132,7 +2132,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const { - if (Subtarget->hasUserSGPRInit16Bug()) { + if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { + // Note: user SGPRs are handled by the front-end for graphics shaders // Pad up the used user SGPRs with dead inputs. unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); @@ -2195,7 +2196,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } - assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16); + assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || + Info.getNumPreloadedSGPRs() >= 16); } static void reservePrivateMemoryRegs(const TargetMachine &TM, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d1fecc1afc7f..e0101f53880f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -487,10 +487,10 @@ public: AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - virtual const TargetRegisterClass * - getRegClassFor(MVT VT, bool isDivergent) const override; - virtual bool requiresUniformRegister(MachineFunction &MF, - const Value *V) const override; + const TargetRegisterClass *getRegClassFor(MVT VT, + bool isDivergent) const override; + bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; Align getPrefLoopAlignment(MachineLoop *ML) const override; void allocateHSAUserSGPRs(CCState &CCInfo, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index ffe8dce79816..fccb08f86e6d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -349,7 +349,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add (sequence "SGPR%u_LO16", 0, 105))> { - let AllocationPriority = 9; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; let HasSGPR = 1; @@ -368,7 +368,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. - let AllocationPriority = 9; + let AllocationPriority = 0; let GeneratePressureSet = 0; let HasSGPR = 1; } @@ -528,14 +528,14 @@ def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; let HasVGPR = 1 in { def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_LO16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (sequence "VGPR%u_HI16", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; } @@ -544,7 +544,7 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -588,7 +588,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // AccVGPR 32-bit registers def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "AGPR%u", 0, 255))> { - let AllocationPriority = 1; + let AllocationPriority = 0; let Size = 32; let Weight = 1; } @@ -653,7 +653,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2 SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, @@ -663,42 +663,42 @@ def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { let Size = 16; - let AllocationPriority = 10; + let AllocationPriority = 0; } } // End GeneratePressureSet = 0 // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 10; + let AllocationPriority = 0; let HasSGPR = 1; } @@ -712,7 +712,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; - let AllocationPriority = 11; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -725,14 +725,14 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; - let AllocationPriority = 13; + let AllocationPriority = 1; let HasSGPR = 1; } @@ -750,7 +750,7 @@ def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, let HasSGPR = 1; } -multiclass SRegClass<int numRegs, int priority, +multiclass SRegClass<int numRegs, list<ValueType> regTypes, SIRegisterTuples regList, SIRegisterTuples ttmpList = regList, @@ -760,7 +760,7 @@ multiclass SRegClass<int numRegs, int priority, defvar sgprName = !strconcat("SGPR_", suffix); defvar ttmpName = !strconcat("TTMP_", suffix); - let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in { + let AllocationPriority = !sub(numRegs, 1), CopyCost = copyCost, HasSGPR = 1 in { def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> { } @@ -781,14 +781,14 @@ multiclass SRegClass<int numRegs, int priority, } } -defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; -defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; -defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; -defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; -defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; -defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; +defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; +defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; +defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; +defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; +defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, LDS_DIRECT_CLASS)> { @@ -803,7 +803,7 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = numRegs; + let AllocationPriority = !sub(numRegs, 1); let Weight = numRegs; } diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index d489a089ac78..5973d32c91d6 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -718,7 +718,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); + dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), (ins))); let Asm64 = "$sdst, $src0_modifiers, $src1"; |
