Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp  164
1 file changed, 148 insertions, 16 deletions
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 338cabcb906bc..f4e8669583699 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,13 +35,16 @@ struct FoldCandidate {
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
+  int ShrinkOpcode;
   unsigned char UseOpNo;
   MachineOperand::MachineOperandType Kind;
   bool Commuted;
 
   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false) :
-    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+                bool Commuted_ = false,
+                int ShrinkOp = -1) :
+    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+    Kind(FoldOp->getType()),
     Commuted(Commuted_) {
     if (FoldOp->isImm()) {
       ImmToFold = FoldOp->getImm();
@@ -68,6 +71,14 @@ struct FoldCandidate {
   bool isCommuted() const {
     return Commuted;
   }
+
+  bool needsShrink() const {
+    return ShrinkOpcode != -1;
+  }
+
+  int getShrinkOpcode() const {
+    return ShrinkOpcode;
+  }
 };
 
 class SIFoldOperands : public MachineFunctionPass {
@@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
 }
 
 static bool updateOperand(FoldCandidate &Fold,
+                          const SIInstrInfo &TII,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
@@ -189,10 +201,49 @@ static bool updateOperand(FoldCandidate &Fold,
         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
       }
     }
+
+    if (Fold.needsShrink()) {
+      MachineBasicBlock *MBB = MI->getParent();
+      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+      if (Liveness != MachineBasicBlock::LQR_Dead)
+        return false;
+
+      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+      int Op32 = Fold.getShrinkOpcode();
+      MachineOperand &Dst0 = MI->getOperand(0);
+      MachineOperand &Dst1 = MI->getOperand(1);
+      assert(Dst0.isDef() && Dst1.isDef());
+
+      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+
+      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
+      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+
+      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+
+      if (HaveNonDbgCarryUse) {
+        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+          .addReg(AMDGPU::VCC, RegState::Kill);
+      }
+
+      // Keep the old instruction around to avoid breaking iterators, but
+      // replace the outputs with dummy registers.
+      Dst0.setReg(NewReg0);
+      Dst1.setReg(NewReg1);
+
+      if (Fold.isCommuted())
+        TII.commuteInstruction(*Inst32, false);
+      return true;
+    }
+
     Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
 
+  assert(!Fold.needsShrink() && "not handled");
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -261,6 +312,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     if (isUseMIInFoldList(FoldList, MI))
       return false;
 
+    unsigned CommuteOpNo = OpNo;
+
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -269,11 +322,12 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
     if (CanCommute) {
       if (CommuteIdx0 == OpNo)
-        OpNo = CommuteIdx1;
+        CommuteOpNo = CommuteIdx1;
       else if (CommuteIdx1 == OpNo)
-        OpNo = CommuteIdx0;
+        CommuteOpNo = CommuteIdx0;
     }
 
+
     // One of operands might be an Imm operand, and OpNo may refer to it after
     // the call of commuteInstruction() below. Such situations are avoided
     // here explicitly as OpNo must be a register operand to be a candidate
@@ -286,12 +340,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
+           Opc == AMDGPU::V_SUB_I32_e64 ||
+           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+          OpToFold->isImm()) {
+        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+        // Verify the other operand is a VGPR, otherwise we would violate the
+        // constant bus restriction.
+        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
+        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+        if (!OtherOp.isReg() ||
+            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
+          return false;
+
+        assert(MI->getOperand(1).isDef());
+
+        int Op32 =  AMDGPU::getVOPe32(Opc);
+        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
+                                         Op32));
+        return true;
+      }
+
       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
     return true;
   }
 
@@ -362,8 +438,6 @@ void SIFoldOperands::foldOperand(
 
   bool FoldingImm = OpToFold.isImm();
 
-  // In order to fold immediates into copies, we need to change the
-  // copy to a MOV.
   if (FoldingImm && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
@@ -371,6 +445,31 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(DestReg) :
       TRI->getPhysRegClass(DestReg);
 
+    unsigned SrcReg  = UseMI->getOperand(1).getReg();
+    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
+      TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator
+          Use = MRI->use_begin(DestReg), E = MRI->use_end();
+          Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          FoldCandidate FC = FoldCandidate(Use->getParent(),
+           Use.getOperandNo(), &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+       }
+        for (auto & F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
+           FoldList, CopiesToReplace);
+        }
+      }
+    }
+
+    // In order to fold immediates into copies, we need to change the
+    // copy to a MOV.
+
     unsigned MovOp = TII->getMovOpcode(DestRC);
     if (MovOp == AMDGPU::COPY)
       return;
@@ -378,6 +477,20 @@ void SIFoldOperands::foldOperand(
     UseMI->setDesc(TII->get(MovOp));
     CopiesToReplace.push_back(UseMI);
   } else {
+    if (UseMI->isCopy() && OpToFold.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+        !UseMI->getOperand(1).getSubReg()) {
+      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+      UseMI->getOperand(1).setIsKill(false);
+      CopiesToReplace.push_back(UseMI);
+      OpToFold.setIsKill(false);
+      return;
+    }
+
     const MCInstrDesc &UseDesc = UseMI->getDesc();
 
     // Don't fold into target independent nodes.  Target independent opcodes
@@ -550,6 +663,19 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   if (!Src0->isImm() && !Src1->isImm())
     return false;
 
+  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+    if (Src0->isImm() && Src0->getImm() == 0) {
+      // v_lshl_or_b32 0, X, Y -> copy Y
+      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
+      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
+      MI->RemoveOperand(Src1Idx);
+      MI->RemoveOperand(Src0Idx);
+
+      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
+      return true;
+    }
+  }
+
   // and k0, k1 -> v_mov_b32 (k0 & k1)
   // or k0, k1 -> v_mov_b32 (k0 | k1)
   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
@@ -728,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     }
   } else {
     // Folding register.
+    SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
     for (MachineRegisterInfo::use_iterator
            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
          Use != E; ++Use) {
-      MachineInstr *UseMI = Use->getParent();
+      UsesToProcess.push_back(Use);
+    }
+    for (auto U : UsesToProcess) {
+      MachineInstr *UseMI = U->getParent();
 
-      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
-                  FoldList, CopiesToReplace);
+      foldOperand(OpToFold, UseMI, U.getOperandNo(),
+        FoldList, CopiesToReplace);
     }
   }
 
@@ -744,7 +874,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
-    if (updateOperand(Fold, *TRI)) {
+    if (updateOperand(Fold, *TII, *TRI)) {
      // Clear kill flags.
       if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -981,9 +1111,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
   //
-  // TODO: Check nsz on instructions when fast math flags are preserved to MI
-  // level.
-  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+  bool IsIEEEMode = ST->enableIEEEBit(MF);
+  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
 
   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     MachineBasicBlock::iterator I, Next;
@@ -994,7 +1123,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       tryFoldInst(TII, &MI);
 
       if (!TII->isFoldableCopy(MI)) {
-        if (IsIEEEMode || !tryFoldOMod(MI))
+        // TODO: Omod might be OK if there is NSZ only on the source
+        // instruction, and not the omod multiply.
+        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
+            !tryFoldOMod(MI))
          tryFoldClamp(MI);
         continue;
       }
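Note: the sketch below is illustrative and not part of the commit. It shows the kind of rewrite the new needsShrink()/getShrinkOpcode() path enables: an immediate that is not legal for the VOP3 (e64) form of V_ADD_I32 (for example a literal such as 0x1234, which pre-GFX10 VOP3 encodings cannot hold) can still be folded by shrinking to the VOP2 (e32) form, which accepts a literal in src0 and writes its carry-out implicitly to VCC. Per the checks in tryAddToFoldList()/updateOperand(), this only happens when VCC is dead at that point and the other source is a VGPR. Register numbers here are made up.

    ; before: the literal sits in a separate move, the add uses the VOP3 encoding
    v_mov_b32_e32 v1, 0x1234
    v_add_i32_e64 v2, s[0:1], v1, v0
    ; after folding + shrinking: the literal is folded into the VOP2 encoding
    v_add_i32_e32 v2, vcc, 0x1234, v0

If the carry-out previously written to s[0:1] has non-debug uses, updateOperand() also inserts a COPY from VCC so the value is preserved.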
