Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |  326
1 files changed, 326 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
new file mode 100644
index 000000000000..f31c722db1b2
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -0,0 +1,326 @@
+//===-- SIPreEmitPeephole.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs peephole optimizations just before code emission:
+/// rewriting VCC branches whose condition is EXEC combined with a constant
+/// mask, and removing redundant S_SET_GPR_IDX_ON instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-emit-peephole"
+
+namespace {
+
+class SIPreEmitPeephole : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+
+ bool optimizeVccBranch(MachineInstr &MI) const;
+ bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+
+public:
+ static char ID;
+
+ SIPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+ "SI peephole optimizations", false, false)
+
+char SIPreEmitPeephole::ID = 0;
+
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+
+bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
+ // Match:
+ // sreg = -1 or 0
+ // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // S_CBRANCH_EXEC[N]Z
+  // This pattern sometimes appears after basic block placement, when a block
+  // that assigns -1 or 0 to a saved mask is combined with a block that
+  // consumes the saved mask and then branches.
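+  // For example (a wave64 sketch; register numbers and operand details are
+  // illustrative):
+  //   $sgpr4_sgpr5 = S_MOV_B64 -1
+  //   $vcc = S_AND_B64 $exec, killed $sgpr4_sgpr5, implicit-def dead $scc
+  //   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+  // becomes
+  //   S_CBRANCH_EXECNZ %bb.2, implicit $exec
+  // with the S_MOV and S_AND also removed when their results have no other
+  // uses.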
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI.getParent();
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+
+ MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+ E = MBB.rend();
+ bool ReadsCond = false;
+ unsigned Threshold = 5;
+ for (++A; A != E; ++A) {
+ if (!--Threshold)
+ return false;
+ if (A->modifiesRegister(ExecReg, TRI))
+ return false;
+ if (A->modifiesRegister(CondReg, TRI)) {
+ if (!A->definesRegister(CondReg, TRI) ||
+ (A->getOpcode() != And && A->getOpcode() != AndN2))
+ return false;
+ break;
+ }
+ ReadsCond |= A->readsRegister(CondReg, TRI);
+ }
+ if (A == E)
+ return false;
+
+ MachineOperand &Op1 = A->getOperand(1);
+ MachineOperand &Op2 = A->getOperand(2);
+ if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ TII->commuteInstruction(*A);
+ Changed = true;
+ }
+ if (Op1.getReg() != ExecReg)
+ return Changed;
+ if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
+ return Changed;
+
+ int64_t MaskValue = 0;
+ Register SReg;
+ if (Op2.isReg()) {
+ SReg = Op2.getReg();
+ auto M = std::next(A);
+ bool ReadsSreg = false;
+ for (; M != E; ++M) {
+ if (M->definesRegister(SReg, TRI))
+ break;
+ if (M->modifiesRegister(SReg, TRI))
+ return Changed;
+ ReadsSreg |= M->readsRegister(SReg, TRI);
+ }
+ if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+ (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
+ return Changed;
+ MaskValue = M->getOperand(1).getImm();
+    // If sreg is only used in the AND instruction, fold the immediate into
+    // the AND.
+ if (!ReadsSreg && Op2.isKill()) {
+ A->getOperand(2).ChangeToImmediate(MaskValue);
+ M->eraseFromParent();
+ }
+ } else if (Op2.isImm()) {
+ MaskValue = Op2.getImm();
+ } else {
+ llvm_unreachable("Op2 must be register or immediate");
+ }
+
+  // Invert the mask for S_ANDN2: it computes EXEC & ~sreg, so a saved mask of
+  // 0 behaves like -1 and vice versa.
+ assert(MaskValue == 0 || MaskValue == -1);
+ if (A->getOpcode() == AndN2)
+ MaskValue = ~MaskValue;
+
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+ MI.killsRegister(CondReg, TRI))
+ A->eraseFromParent();
+
+ bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+ if (SReg == ExecReg) {
+ // EXEC is updated directly
+ if (IsVCCZ) {
+ MI.eraseFromParent();
+ return true;
+ }
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (IsVCCZ && MaskValue == 0) {
+ // Will always branch
+    // Remove all successors shadowed by the new unconditional branch.
+ MachineBasicBlock *Parent = MI.getParent();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool Found = false;
+ for (MachineInstr &Term : Parent->terminators()) {
+ if (Found) {
+ if (Term.isBranch())
+ ToRemove.push_back(&Term);
+ } else {
+ Found = Term.isIdenticalTo(MI);
+ }
+ }
+ assert(Found && "conditional branch is not terminator");
+ for (auto BranchMI : ToRemove) {
+ MachineOperand &Dst = BranchMI->getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ Parent->removeSuccessor(Dst.getMBB());
+ BranchMI->eraseFromParent();
+ }
+
+ if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
+ Parent->removeSuccessor(Succ);
+ }
+
+ // Rewrite to unconditional branch
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (!IsVCCZ && MaskValue == 0) {
+ // Will never branch
+ MachineOperand &Dst = MI.getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ MI.getParent()->removeSuccessor(Dst.getMBB());
+ MI.eraseFromParent();
+ return true;
+ } else if (MaskValue == -1) {
+ // Depends only on EXEC
+ MI.setDesc(
+ TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
+ }
+
+ MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.addImplicitDefUseOperands(*MBB.getParent());
+
+ return true;
+}
+
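+// Remove the second of two identical S_SET_GPR_IDX_ON instructions, together
+// with any S_SET_GPR_IDX_OFF in between, when it is safe to do so. Roughly
+// (an illustrative sketch, not taken from a particular shader):
+//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)
+//   v_mov_b32_e32     v0, v4               ; indirect move using M0
+//   s_set_gpr_idx_off                      ; removed
+//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)    ; identical to the first, removed
+//   v_mov_b32_e32     v1, v5
+// The rewrite gives up if anything in between clobbers M0 or the index
+// register, changes the GPR index mode, or touches vector registers outside
+// of indirect moves.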
+bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
+ MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool IdxOn = true;
+
+ if (!MI.isIdenticalTo(First))
+ return false;
+
+  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
+  // sure it is safe to drop the second one and any intervening
+  // S_SET_GPR_IDX_OFF.
+ for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
+ E = MI.getIterator(); I != E; ++I) {
+ switch (I->getOpcode()) {
+ case AMDGPU::S_SET_GPR_IDX_MODE:
+ return false;
+ case AMDGPU::S_SET_GPR_IDX_OFF:
+ IdxOn = false;
+ ToRemove.push_back(&*I);
+ break;
+ default:
+ if (I->modifiesRegister(AMDGPU::M0, TRI))
+ return false;
+ if (IdxReg && I->modifiesRegister(IdxReg, TRI))
+ return false;
+ if (llvm::any_of(I->operands(),
+ [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() &&
+ TRI->isVectorRegister(MRI, MO.getReg());
+ })) {
+ // The only exception allowed here is another indirect vector move
+ // with the same mode.
+ if (!IdxOn ||
+ !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
+ I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
+ return false;
+ }
+ }
+ }
+
+ MI.eraseFromParent();
+ for (MachineInstr *RI : ToRemove)
+ RI->eraseFromParent();
+ return true;
+}
+
+bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
+ if (MBBE != MBB.end()) {
+ MachineInstr &MI = *MBBE;
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_CBRANCH_VCCZ:
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ Changed |= optimizeVccBranch(MI);
+ continue;
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ // FIXME: This is not an optimization and should be
+ // moved somewhere else.
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (&MBB != &MF.back() || &MI != &MBB.back()) {
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
+ // at the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ MI.eraseFromParent();
+ MBBE = MBB.getFirstTerminator();
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!ST.hasVGPRIndexMode())
+ continue;
+
+ MachineInstr *SetGPRMI = nullptr;
+ const unsigned Threshold = 20;
+ unsigned Count = 0;
+    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see
+    // whether the second one is redundant. The expensive checks happen in
+    // optimizeSetGPR(); the distance between the pair is limited to 20
+    // instructions to keep compile time in check.
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
+ MachineInstr &MI = *MBBI;
+ ++MBBI;
+
+ if (Count == Threshold)
+ SetGPRMI = nullptr;
+ else
+ ++Count;
+
+ if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
+ continue;
+
+ Count = 0;
+ if (!SetGPRMI) {
+ SetGPRMI = &MI;
+ continue;
+ }
+
+ if (optimizeSetGPR(*SetGPRMI, MI))
+ Changed = true;
+ else
+ SetGPRMI = &MI;
+ }
+ }
+
+ return Changed;
+}