Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 210
1 file changed, 202 insertions(+), 8 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 396d22c7ec18..fee900b3efb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -338,8 +338,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
       return false;
 
-    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
-    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+    Offset0 = Off0->getAsZExtVal();
+    Offset1 = Off1->getAsZExtVal();
     return true;
   }
 
@@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
    break;
  }
+
+  case AMDGPU::S_MUL_U64_U32_PSEUDO:
+  case AMDGPU::S_MUL_I64_I32_PSEUDO:
+    MI.setDesc(get(AMDGPU::S_MUL_U64));
+    break;
   }
   return true;
 }
@@ -4153,15 +4158,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
-    return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
-           AMDGPU::isInlinableIntLiteral((int16_t)Imm);
+    return AMDGPU::isInlinableLiteralV2I16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+    return AMDGPU::isInlinableLiteralV2F16(Imm);
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
-  case AMDGPU::OPERAND_REG_IMM_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     // Default handling
     break;
   }
+
+  case AMDGPU::S_MUL_U64:
+    // Split s_mul_u64 in 32-bit vector multiplications.
+    splitScalarSMulU64(Worklist, Inst, MDT);
+    Inst.eraseFromParent();
+    return;
+
+  case AMDGPU::S_MUL_U64_U32_PSEUDO:
+  case AMDGPU::S_MUL_I64_I32_PSEUDO:
+    // This is a special case of s_mul_u64 where all the operands are either
+    // zero extended or sign extended.
+    splitScalarSMulPseudo(Worklist, Inst, MDT);
+    Inst.eraseFromParent();
+    return;
+
   case AMDGPU::S_AND_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
     Inst.eraseFromParent();
@@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
+// There is not a vector equivalent of s_mul_u64. For this reason, we need to
+// split the s_mul_u64 in 32-bit vector multiplications.
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
+                                     MachineInstr &Inst,
+                                     MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src0SubRC))
+    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+  const TargetRegisterClass *Src1SubRC =
+      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src1SubRC))
+    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+  // First, we extract the low 32-bit and high 32-bit values from each of the
+  // operands.
+  MachineOperand Op0L =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand Op1L =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+  MachineOperand Op0H =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+  MachineOperand Op1H =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+  // The multiplication is done as follows:
+  //
+  //                            Op1H  Op1L
+  //                          * Op0H  Op0L
+  //                       --------------------
+  //                       Op1H*Op0L  Op1L*Op0L
+  //          + Op1H*Op0H  Op1L*Op0H
+  // -----------------------------------------
+  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
+  //
+  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+  //  value and that would overflow.
+  //  The low 32-bit value is Op1L*Op0L.
+  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
+
+  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1L_Op0H =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
+          .add(Op1L)
+          .add(Op0H);
+
+  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1H_Op0L =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
+          .add(Op1H)
+          .add(Op0L);
+
+  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Carry =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
+          .add(Op1L)
+          .add(Op0L);
+
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+          .add(Op1L)
+          .add(Op0L);
+
+  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
+                          .addReg(Op1L_Op0H_Reg)
+                          .addReg(Op1H_Op0L_Reg);
+
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
+          .addReg(AddReg)
+          .addReg(CarryReg);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*Op1L_Op0H, MDT);
+  legalizeOperands(*Op1H_Op0L, MDT);
+  legalizeOperands(*Carry, MDT);
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*Add, MDT);
+  legalizeOperands(*HiHalf, MDT);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
+// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
+// multiplications.
+void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
+                                        MachineInstr &Inst,
+                                        MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src0SubRC))
+    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+  const TargetRegisterClass *Src1SubRC =
+      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src1SubRC))
+    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+  // First, we extract the low 32-bit and high 32-bit values from each of the
+  // operands.
+  MachineOperand Op0L =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand Op1L =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+  unsigned Opc = Inst.getOpcode();
+  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
+                        ? AMDGPU::V_MUL_HI_U32_e64
+                        : AMDGPU::V_MUL_HI_I32_e64;
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
+
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+          .add(Op1L)
+          .add(Op0L);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*HiHalf, MDT);
+  legalizeOperands(*LoHalf, MDT);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                            MachineInstr &Inst, unsigned Opcode,
                                            MachineDominatorTree *MDT) const {
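For reference, the arithmetic behind the two new lowerings can be checked with plain host code. The sketch below is not part of the patch and its helper names are invented for illustration; it mirrors the partial-product scheme from the comment in splitScalarSMulU64 (low half = lo(Op1L*Op0L); high half = Op1H*Op0L + Op1L*Op0H + carry; Op1H*Op0H dropped), and the {mul_hi, mul_lo} shortcut splitScalarSMulPseudo uses when both operands are known to be zero or sign extended 32-bit values.

#include <cassert>
#include <cstdint>

// Full 64x64 -> 64 multiply built from 32-bit pieces, the way
// splitScalarSMulU64 emits it with V_MUL_LO_U32 / V_MUL_HI_U32 / V_ADD_U32.
static uint64_t Mul64From32(uint64_t A, uint64_t B) {
  uint32_t AL = uint32_t(A), AH = uint32_t(A >> 32);
  uint32_t BL = uint32_t(B), BH = uint32_t(B >> 32);
  uint32_t Lo = AL * BL;                                // V_MUL_LO_U32
  uint32_t Carry = uint32_t((uint64_t(AL) * BL) >> 32); // V_MUL_HI_U32
  uint32_t Hi = AH * BL + AL * BH + Carry;              // 2x V_MUL_LO_U32 + V_ADD_U32
  return (uint64_t(Hi) << 32) | Lo;                     // REG_SEQUENCE
}

// Special case behind S_MUL_U64_U32_PSEUDO: both operands are zero-extended
// 32-bit values, so the product is just {mul_hi, mul_lo} of the low halves.
static uint64_t Mul64FromU32U32(uint32_t A, uint32_t B) {
  uint32_t Lo = A * B;                                  // V_MUL_LO_U32
  uint32_t Hi = uint32_t((uint64_t(A) * B) >> 32);      // V_MUL_HI_U32
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(Mul64From32(0x123456789ABCDEF0ULL, 0x0FEDCBA987654321ULL) ==
         0x123456789ABCDEF0ULL * 0x0FEDCBA987654321ULL);
  assert(Mul64FromU32U32(0xDEADBEEFu, 0x12345678u) ==
         uint64_t(0xDEADBEEFu) * 0x12345678u);
  return 0;
}

The signed pseudo (S_MUL_I64_I32_PSEUDO) differs only in using V_MUL_HI_I32 for the high half, since the low 32 bits of a product do not depend on signedness.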