Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 235
1 file changed, 184 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4eac03168760..2ff8baf29394 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,7 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -312,6 +312,19 @@ static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
   return false;
 }
 
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  // Skip additional folding on the same operand.
+  for (FoldCandidate &Fold : FoldList)
+    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+      return;
+  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
+                    << " operand " << OpNo << "\n  " << *MI << '\n');
+  FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineInstr *MI, unsigned OpNo,
                              MachineOperand *OpToFold,
@@ -344,7 +357,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     // Special case for s_setreg_b32
     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
-      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
       return true;
     }
 
@@ -403,8 +416,7 @@
         unsigned MaybeCommutedOpc = MI->getOpcode();
         int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
 
-        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
-                                         Op32));
+        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
         return true;
       }
 
@@ -412,11 +424,34 @@
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
     return true;
   }
 
-  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  // Check the case where we might introduce a second constant operand to a
+  // scalar instruction
+  if (TII->isSALU(MI->getOpcode())) {
+    const MCInstrDesc &InstDesc = MI->getDesc();
+    const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
+    const SIRegisterInfo &SRI = TII->getRegisterInfo();
+
+    // Fine if the operand can be encoded as an inline constant
+    if (OpToFold->isImm()) {
+      if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
+          !TII->isInlineConstant(*OpToFold, OpInfo)) {
+        // Otherwise check for another constant
+        for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
+          auto &Op = MI->getOperand(i);
+          if (OpNo != i &&
+              TII->isLiteralConstantLike(Op, OpInfo)) {
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
 
@@ -429,6 +464,42 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking each to a foldable inline immediate if possible.
+// Returns true on success.
+static bool getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
+    Register UseReg, uint8_t OpTy,
+    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
+  MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    MachineOperand *Sub = &Def->getOperand(I);
+    assert(Sub->isReg());
+
+    for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
+         SubDef && Sub->isReg() && !Sub->getSubReg() &&
+         TII->isFoldableCopy(*SubDef);
+         SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+      MachineOperand *Op = &SubDef->getOperand(1);
+      if (Op->isImm()) {
+        if (TII->isInlineConstant(*Op, OpTy))
+          Sub = Op;
+        break;
+      }
+      if (!Op->isReg())
+        break;
+      Sub = Op;
+    }
+
+    Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+  }
+
+  return true;
+}
+
 static bool tryToFoldACImm(const SIInstrInfo *TII,
                            const MachineOperand &OpToFold,
                            MachineInstr *UseMI,
@@ -462,39 +533,30 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
     return false;
 
   MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
-  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
+  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
     return false;
 
-  int64_t Imm;
-  MachineOperand *Op;
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
-    const MachineOperand &Sub = Def->getOperand(I);
-    if (!Sub.isReg() || Sub.getSubReg())
-      return false;
-    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
-    while (SubDef && !SubDef->isMoveImmediate() &&
-           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
-      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
-    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+  int32_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
       return false;
-    Op = &SubDef->getOperand(1);
+
     auto SubImm = Op->getImm();
-
-    if (I == 1) {
-      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+    if (!I) {
+      Imm = SubImm;
+      if (!TII->isInlineConstant(*Op, OpTy) ||
+          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;
 
-      Imm = SubImm;
       continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }
 
-  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-    return false;
-
-  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
   return true;
 }
 
@@ -513,18 +575,6 @@ void SIFoldOperands::foldOperand(
   if (UseOp.isReg() && OpToFold.isReg()) {
     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
       return;
-
-    // Don't fold subregister extracts into tied operands, only if it is a full
-    // copy since a subregister use tied to a full register def doesn't really
-    // make sense. e.g. don't fold:
-    //
-    // %1 = COPY %0:sub1
-    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
-    //
-    // into
-    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
-    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
-      return;
   }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
@@ -639,16 +689,97 @@ void SIFoldOperands::foldOperand(
     CopiesToReplace.push_back(UseMI);
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
-        Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
-        TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
-        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
+        UseMI->getOperand(0).getReg().isVirtual() &&
         !UseMI->getOperand(1).getSubReg()) {
+      LLVM_DEBUG(dbgs() << "Folding " << OpToFold
+                        << "\n into " << *UseMI << '\n');
       unsigned Size = TII->getOpSize(*UseMI, 1);
-      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      Register UseReg = OpToFold.getReg();
+      UseMI->getOperand(1).setReg(UseReg);
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
       UseMI->getOperand(1).setIsKill(false);
       CopiesToReplace.push_back(UseMI);
       OpToFold.setIsKill(false);
+
+      // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
+      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
+      // its initializers right here, so we will rematerialize immediates and
+      // avoid copies via different reg classes.
+      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
+                        *MRI)) {
+        const DebugLoc &DL = UseMI->getDebugLoc();
+        MachineBasicBlock &MBB = *UseMI->getParent();
+
+        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
+          UseMI->RemoveOperand(I);
+
+        MachineInstrBuilder B(*MBB.getParent(), UseMI);
+        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+        for (unsigned I = 0; I < Size / 4; ++I) {
+          MachineOperand *Def = Defs[I].first;
+          TargetInstrInfo::RegSubRegPair CopyToVGPR;
+          if (Def->isImm() &&
+              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+            int64_t Imm = Def->getImm();
+
+            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL,
+                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+            B.addReg(Tmp);
+          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+            auto Src = getRegSubRegPair(*Def);
+            Def->setIsKill(false);
+            if (!SeenAGPRs.insert(Src)) {
+              // We cannot build a reg_sequence out of the same registers, they
+              // must be copied. Better do it here before copyPhysReg() creates
+              // several reads to do the AGPR->VGPR->AGPR copy.
+              CopyToVGPR = Src;
+            } else {
+              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
+                       Src.SubReg);
+            }
+          } else {
+            assert(Def->isReg());
+            Def->setIsKill(false);
+            auto Src = getRegSubRegPair(*Def);
+
+            // Direct copy from SGPR to AGPR is not possible. To avoid creating
+            // exploded SGPR->VGPR->AGPR copies in copyPhysReg() later, create
+            // a copy here and track if we already have such a copy.
+            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+              CopyToVGPR = Src;
+            } else {
+              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+              B.addReg(Tmp);
+            }
+          }
+
+          if (CopyToVGPR.Reg) {
+            Register Vgpr;
+            if (VGPRCopies.count(CopyToVGPR)) {
+              Vgpr = VGPRCopies[CopyToVGPR];
+            } else {
+              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+              VGPRCopies[CopyToVGPR] = Vgpr;
+            }
+            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL,
+                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
+            B.addReg(Tmp);
+          }
+
+          B.addImm(Defs[I].second);
+        }
+        LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+        return;
+      }
+
       if (Size != 4)
         return;
       if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
@@ -1099,6 +1230,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
+    assert(!Fold.isReg() || Fold.OpToFold);
     if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
       Register Reg = Fold.OpToFold->getReg();
       MachineInstr *DefMI = Fold.OpToFold->getParent();
@@ -1249,8 +1381,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_MUL_F32_e64:
   case AMDGPU::V_MUL_F16_e64: {
     // If output denormals are enabled, omod is ignored.
-    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
-        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+    if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
+        (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
       return std::make_pair(nullptr, SIOutMods::NONE);
 
     const MachineOperand *RegOp = nullptr;
@@ -1279,8 +1411,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_ADD_F32_e64:
   case AMDGPU::V_ADD_F16_e64: {
     // If output denormals are enabled, omod is ignored.
-    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
-        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+    if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
+        (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
       return std::make_pair(nullptr, SIOutMods::NONE);
 
     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
@@ -1358,15 +1490,16 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       tryFoldInst(TII, &MI);
 
       if (!TII->isFoldableCopy(MI)) {
+        // Saw an unknown clobber of m0, so we no longer know what it is.
+        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+          CurrentKnownM0Val = nullptr;
+
        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          tryFoldClamp(MI);
 
-        // Saw an unknown clobber of m0, so we no longer know what it is.
-        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
-          CurrentKnownM0Val = nullptr;
        continue;
      }
 
@@ -1409,5 +1542,5 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       foldInstOperand(MI, OpToFold);
     }
   }
-  return false;
+  return true;
 }
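Note on the new SALU check in tryAddToFoldList: a SALU instruction can encode at most one 32-bit literal, while small immediates in the inline-constant range are encoded for free, so the fold must refuse an immediate that would become a second literal. Below is a minimal standalone sketch of that rule; the names (Operand, canFoldImmIntoSALU) are hypothetical and the floating-point inline constants are omitted.

#include <cstdint>
#include <vector>

// AMDGPU encodes integers in [-16, 64] as inline constants; anything else
// must occupy the instruction's single 32-bit literal slot.
static bool isInlineConstant(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

struct Operand {
  bool IsImm = false;
  int64_t Imm = 0;
};

// Folding Imm into operand OpNo is rejected when Imm needs the literal slot
// and some other operand already uses it (mirrors the isLiteralConstantLike
// scan added to tryAddToFoldList).
static bool canFoldImmIntoSALU(const std::vector<Operand> &Ops,
                               unsigned OpNo, int64_t Imm) {
  if (isInlineConstant(Imm))
    return true; // Inline constants never consume the literal slot.
  for (unsigned I = 0, E = Ops.size(); I != E; ++I)
    if (I != OpNo && Ops[I].IsImm && !isInlineConstant(Ops[I].Imm))
      return false; // A second literal is not encodable.
  return true;
}

For example, folding 0x12345 into an s_add_i32 whose other source is already the literal 0x54321 is rejected, while folding 7 (an inline constant) is always fine.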
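Note on getRegSeqInit and the rewritten tryToFoldACImm: the helper reduces a REG_SEQUENCE to one initializer per 32-bit lane, and the fold succeeds only when every lane holds the same inline-constant immediate. A standalone model of that splat test, illustrative only; the real code tracks MachineOperands through foldable copies.

#include <cstdint>
#include <optional>
#include <vector>

static bool isInlineConstant(int32_t Imm) { return Imm >= -16 && Imm <= 64; }

// Returns the immediate to fold when all lane initializers are the same
// inline constant; mirrors the "Can only fold splat constants" rule.
static std::optional<int32_t>
getFoldableSplat(const std::vector<int32_t> &LaneInits) {
  if (LaneInits.empty() || !isInlineConstant(LaneInits.front()))
    return std::nullopt;
  for (int32_t Lane : LaneInits)
    if (Lane != LaneInits.front())
      return std::nullopt; // Mixed lanes cannot fold as one immediate.
  return LaneInits.front();
}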
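Note on the AGPR reg_sequence rewrite in foldOperand: v_accvgpr_write_b32 accepts only a VGPR or an inline immediate, and there is no direct SGPR-to-AGPR copy, so the patch materializes one VGPR bounce copy per distinct source (the VGPRCopies map) instead of letting copyPhysReg() expand a copy chain for every lane. A runnable toy model of that caching, with simplified register types standing in for the real MIR objects.

#include <cstdio>
#include <map>

using Reg = unsigned;
static Reg NextVReg = 100; // Toy virtual register allocator.

static Reg createVGPR() { return NextVReg++; }

// One COPY per distinct source; later lanes reuse the cached VGPR.
static Reg getVGPRBounce(std::map<Reg, Reg> &VGPRCopies, Reg Src) {
  auto It = VGPRCopies.find(Src);
  if (It != VGPRCopies.end())
    return It->second;
  Reg Vgpr = createVGPR();
  std::printf("%%%u = COPY %%%u\n", Vgpr, Src);
  VGPRCopies[Src] = Vgpr;
  return Vgpr;
}

int main() {
  std::map<Reg, Reg> VGPRCopies;
  // Four lanes initialized from the same SGPR %5: one COPY plus four
  // v_accvgpr_write_b32, instead of four exploded SGPR->VGPR->AGPR chains.
  for (int Lane = 0; Lane < 4; ++Lane) {
    Reg Vgpr = getVGPRBounce(VGPRCopies, /*Sgpr=*/5);
    std::printf("%%a%d = V_ACCVGPR_WRITE_B32 %%%u\n", Lane, Vgpr);
  }
  return 0;
}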