Diffstat (limited to 'lib/Target/AMDGPU/SIShrinkInstructions.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 152
1 file changed, 114 insertions(+), 38 deletions(-)
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4f0913fe62f23..6cba55300a8cd 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk,
STATISTIC(NumLiteralConstantsFolded,
"Number of literal constants folded into 32-bit instructions.");
-namespace llvm {
- void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
using namespace llvm;
namespace {
@@ -61,10 +57,8 @@ public:
} // End anonymous namespace.
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+ "SI Shrink Instructions", false, false)
char SIShrinkInstructions::ID = 0;
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
- return false;
-
- return true;
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
/// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
// We have failed to fold src0, so commute the instruction and try again.
- if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
foldImmediates(MI, TII, MRI, false);
}
// Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
- assert(!Orig.isImplicit());
- return MachineOperand::CreateReg(Orig.getReg(),
- Orig.isDef(),
- true,
- Orig.isKill(),
- Orig.isDead(),
- Orig.isUndef(),
- Orig.isEarlyClobber(),
- Orig.getSubReg(),
- Orig.isDebug(),
- Orig.isInternalRead());
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+ const MachineOperand &Orig) {
+
+ for (MachineOperand &Use : MI.implicit_operands()) {
+ if (Use.getReg() == AMDGPU::VCC) {
+ Use.setIsUndef(Orig.isUndef());
+ Use.setIsKill(Orig.isKill());
+ return;
+ }
+ }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+ return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}
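
For illustration, the same predicate as a self-contained sketch; the helper name isKImmCandidate is invented here, and the -16..64 integer range is the inline-constant bound this file checks elsewhere (the real TII->isInlineConstant also accepts a handful of floating-point inline values):

    #include <cstdint>

    // Illustrative stand-in for isKImmOperand: the immediate fits the
    // 16-bit simm16 field of a SOPK encoding, but is not already free
    // as an inline constant.
    static bool isKImmCandidate(int64_t Imm) {
      bool FitsSImm16 = Imm >= -32768 && Imm <= 32767;
      bool IsInlineInt = Imm >= -16 && Imm <= 64; // integer inline range
      return FitsSImm16 && !IsInlineInt;
    }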
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
+ if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+ // If this has a literal constant source that is the same as the
+ // reversed bits of an inline immediate, replace with a bitreverse of
+ // that constant. This saves 4 bytes in the common case of materializing
+ // sign bits.
+
+ // Test if we are after regalloc. We only want to do this after other
+ // optimizations have run, because this rewrite would confuse them.
+ // XXX - not exactly a check for a post-regalloc run.
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isImm() &&
+ TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ int64_t Imm = Src.getImm();
+ if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+ int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
+ if (ReverseImm >= -16 && ReverseImm <= 64) {
+ MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+ Src.setImm(ReverseImm);
+ continue;
+ }
+ }
+ }
+ }
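
A worked example of the rewrite above; reverse32 is a plain stand-in for llvm::reverseBits<int32_t>, and the -16..64 bound is the inline-immediate check from the code:

    #include <cassert>
    #include <cstdint>

    // Plain 32-bit bit-reversal, standing in for llvm::reverseBits.
    static uint32_t reverse32(uint32_t V) {
      uint32_t R = 0;
      for (int I = 0; I < 32; ++I) {
        R = (R << 1) | (V & 1);
        V >>= 1;
      }
      return R;
    }

    int main() {
      // The bare sign bit reverses to 1, an inline immediate, so
      //   v_mov_b32 v0, 0x80000000  ->  v_bfrev_b32 v0, 1
      // drops the 4-byte literal from the encoding.
      assert(reverse32(0x80000000u) == 1u);
      // Two high bits reverse to 3, also inline:
      assert(reverse32(0xC0000000u) == 3u);
      return 0;
    }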
+
+ // Combine adjacent s_nops using the immediate operand, which encodes
+ // how long to wait.
+ //
+ // s_nop N
+ // s_nop M
+ // =>
+ // s_nop (N + M)
+ if (MI.getOpcode() == AMDGPU::S_NOP &&
+ Next != MBB.end() &&
+ (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+ MachineInstr &NextMI = *Next;
+ // The instruction encodes the amount to wait with an offset of 1,
+ // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+ // after adding.
+ uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+ uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+ // Make sure we don't exceed the 8-cycle bound of the encoding.
+ if (Nop0 + Nop1 <= 8) {
+ NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+ MI.eraseFromParent();
+ }
+
+ continue;
+ }
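
The wait arithmetic in isolation; a minimal sketch assuming only what the code above states, namely the +1 encoding offset and the 8-cycle cap:

    #include <cassert>

    // Illustrative merge of two s_nop immediates; returns -1 when the
    // combined wait does not fit and the pass leaves the pair alone.
    static int combineNops(unsigned Imm0, unsigned Imm1) {
      unsigned Cycles = (Imm0 + 1) + (Imm1 + 1); // imm N waits N+1 cycles
      return Cycles <= 8 ? static_cast<int>(Cycles - 1) : -1;
    }

    int main() {
      assert(combineNops(2, 3) == 6);  // s_nop 2 + s_nop 3 -> s_nop 6
      assert(combineNops(3, 3) == 7);  // exactly at the 8-cycle cap
      assert(combineNops(4, 4) == -1); // 10 cycles: left as two s_nops
      return 0;
    }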
+
+ // FIXME: We also need to consider movs of constant operands since
+ // immediate operands are not folded if they have more than one use, and
+ // the operand folding pass cannot tell whether the immediate will be
+ // free, since it does not know whether the src == dest constraint will
+ // end up being satisfied.
+ if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+ MI.getOpcode() == AMDGPU::S_MUL_I32) {
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
+
+ // FIXME: This could work better if hints worked with subregisters. If
+ // we have a vector add of a constant, we usually don't get the correct
+ // allocation due to the subregister usage.
+ if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+ Src0.isReg()) {
+ MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+ continue;
+ }
+
+ if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+ if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+ AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+ MI.setDesc(TII->get(Opc));
+ MI.tieOperands(0, 1);
+ }
+ }
+ }
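
Before/after for the SOPK rewrite above; the tieOperands(0, 1) call reflects that the SOPK form has one register acting as both source and destination (sizes assume the usual 4-byte instruction word plus 4-byte literal):

    // Before: full SOP2 form, 32-bit literal follows the instruction word.
    //   s_add_i32 s0, s0, 0x1234      (8 bytes)
    // After: SOPK form folds the constant into its simm16 field.
    //   s_addk_i32 s0, 0x1234         (4 bytes)
    // The same shrink turns s_mul_i32 into s_mulk_i32.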
+
// Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm()) {
- if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
- MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- }
+ if (Src.isImm() && isKImmOperand(TII, Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
continue;
}
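
A few concrete cases for the S_MOVK_I32 rewrite, using the same isKImmOperand test as the s_addk path above:

    //   s_mov_b32 s0, 64       -> unchanged: 64 is a free inline constant
    //   s_mov_b32 s0, 65       -> s_movk_i32 s0, 65 (literal dropped)
    //   s_mov_b32 s0, 0x12345  -> unchanged: does not fit in simm16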
@@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (!canShrink(MI, TII, TRI, MRI)) {
// Try commuting the instruction and see if that enables us to shrink
// it.
- if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+ if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
!canShrink(MI, TII, TRI, MRI))
continue;
}
@@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
- // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+ // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
if (Op32DstIdx != -1) {
// dst
Inst32.addOperand(MI.getOperand(0));
@@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Inst32.addOperand(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
- // replaced with an implicit read of vcc.
- assert(Src2->getReg() == AMDGPU::VCC &&
- "Unexpected missing register operand");
- Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+ // replaced with an implicit read of vcc. This was already added
+ // during the initial BuildMI, so find it to preserve the flags.
+ copyFlagsToImplicitVCC(*Inst32, *Src2);
}
}