Diffstat (limited to 'lib/Target/AMDGPU/SIShrinkInstructions.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIShrinkInstructions.cpp | 152
1 file changed, 114 insertions, 38 deletions
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4f0913fe62f23..6cba55300a8cd 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk,
 STATISTIC(NumLiteralConstantsFolded,
           "Number of literal constants folded into 32-bit instructions.");
 
-namespace llvm {
-  void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
 using namespace llvm;
 
 namespace {
@@ -61,10 +57,8 @@ public:
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
-                      "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
-                    "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+                "SI Shrink Instructions", false, false)
 
 char SIShrinkInstructions::ID = 0;
 
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
   if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     return false;
 
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
-    return false;
-
-  return true;
+  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
 }
 
 /// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   }
 
   // We have failed to fold src0, so commute the instruction and try again.
-  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
     foldImmediates(MI, TII, MRI, false);
 
 }
 
 // Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
-  assert(!Orig.isImplicit());
-  return MachineOperand::CreateReg(Orig.getReg(),
-                                   Orig.isDef(),
-                                   true,
-                                   Orig.isKill(),
-                                   Orig.isDead(),
-                                   Orig.isUndef(),
-                                   Orig.isEarlyClobber(),
-                                   Orig.getSubReg(),
-                                   Orig.isDebug(),
-                                   Orig.isInternalRead());
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
 }
 
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
   std::vector<unsigned> I1Defs;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+        // If this has a literal constant source that is the same as the
+        // reversed bits of an inline immediate, replace with a bitreverse of
+        // that constant. This saves 4 bytes in the common case of materializing
+        // sign bits.
+
+        // Test if we are after regalloc. We only want to do this after any
+        // optimizations happen because this will confuse them.
+        // XXX - not exactly a check for post-regalloc run.
+        MachineOperand &Src = MI.getOperand(1);
+        if (Src.isImm() &&
+            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+          int64_t Imm = Src.getImm();
+          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+            int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
+            if (ReverseImm >= -16 && ReverseImm <= 64) {
+              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+              Src.setImm(ReverseImm);
+              continue;
+            }
+          }
+        }
+      }
+
+      // Combine adjacent s_nops to use the immediate operand encoding how long
+      // to wait.
+      //
+      // s_nop N
+      // s_nop M
+      //  =>
+      // s_nop (N + M)
+      if (MI.getOpcode() == AMDGPU::S_NOP &&
+          Next != MBB.end() &&
+          (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+        MachineInstr &NextMI = *Next;
+        // The instruction encodes the amount to wait with an offset of 1,
+        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+        // after adding.
+        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+        // Make sure we don't overflow the bounds.
+        if (Nop0 + Nop1 <= 8) {
+          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+          MI.eraseFromParent();
+        }
+
+        continue;
+      }
+
+      // FIXME: We also need to consider movs of constant operands since
+      // immediate operands are not folded if they have more than one use, and
+      // the operand folding pass is unaware if the immediate will be free since
+      // it won't know if the src == dest constraint will end up being
+      // satisfied.
+      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+          MI.getOpcode() == AMDGPU::S_MUL_I32) {
+        const MachineOperand &Dest = MI.getOperand(0);
+        const MachineOperand &Src0 = MI.getOperand(1);
+        const MachineOperand &Src1 = MI.getOperand(2);
+
+        // FIXME: This could work better if hints worked with subregisters. If
+        // we have a vector add of a constant, we usually don't get the correct
+        // allocation due to the subregister usage.
+        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+            Src0.isReg()) {
+          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+          continue;
+        }
+
+        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+            MI.setDesc(TII->get(Opc));
+            MI.tieOperands(0, 1);
+          }
+        }
+      }
+
       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
         const MachineOperand &Src = MI.getOperand(1);
 
-        if (Src.isImm()) {
-          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
-            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-        }
+        if (Src.isImm() && isKImmOperand(TII, Src))
+          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
 
         continue;
       }
@@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (!canShrink(MI, TII, TRI, MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
        // it.
-        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
             !canShrink(MI, TII, TRI, MRI))
           continue;
       }
@@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
 
-      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
       // For VOPC instructions, this is replaced by an implicit def of vcc.
-      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
       if (Op32DstIdx != -1) {
         // dst
         Inst32.addOperand(MI.getOperand(0));
@@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
           Inst32.addOperand(*Src2);
         } else {
           // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
-          // replaced with an implicit read of vcc.
-          assert(Src2->getReg() == AMDGPU::VCC &&
-                 "Unexpected missing register operand");
-          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+          // replaced with an implicit read of vcc. This was already added
+          // during the initial BuildMI, so find it to preserve the flags.
+          copyFlagsToImplicitVCC(*Inst32, *Src2);
         }
       }
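As a quick sanity check on the arithmetic behind two of the new transforms, the short standalone C++ sketch below (not part of the commit; reverseBits32 is a hypothetical local stand-in for llvm::reverseBits<int32_t>) verifies that the sign-bit literal 0x80000000 bit-reverses to 1, which falls inside the inline-constant range [-16, 64] tested by the V_MOV_B32 case, and that two adjacent s_nops waiting 3 and 4 cycles merge into a single s_nop 6 within the 8-cycle bound.

#include <cassert>
#include <cstdint>

// Hypothetical local stand-in for llvm::reverseBits<int32_t>.
static int32_t reverseBits32(uint32_t V) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((V >> I) & 1u) << (31 - I);
  return static_cast<int32_t>(R);
}

int main() {
  // v_mov_b32 of 0x80000000 needs a 32-bit literal, but its bit-reversed
  // value is 1, an inline constant, so it can be emitted as v_bfrev_b32
  // and save 4 bytes of encoding.
  int32_t Rev = reverseBits32(0x80000000u);
  assert(Rev == 1 && Rev >= -16 && Rev <= 64);

  // s_nop K waits K + 1 cycles: s_nop 2 and s_nop 3 wait 3 + 4 = 7 cycles,
  // which is within the 8-cycle bound, so they merge into one s_nop 6.
  int Nop0 = 2 + 1, Nop1 = 3 + 1;
  assert(Nop0 + Nop1 <= 8);
  assert(Nop0 + Nop1 - 1 == 6);
  return 0;
}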