Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 210
1 files changed, 202 insertions, 8 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 396d22c7ec18..fee900b3efb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -338,8 +338,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
       return false;
 
-    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
-    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+    Offset0 = Off0->getAsZExtVal();
+    Offset1 = Off1->getAsZExtVal();
     return true;
   }
 
@@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
+
+  case AMDGPU::S_MUL_U64_U32_PSEUDO:
+  case AMDGPU::S_MUL_I64_I32_PSEUDO:
+    MI.setDesc(get(AMDGPU::S_MUL_U64));
+    break;
   }
   return true;
 }
@@ -4153,15 +4158,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
-    return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
-           AMDGPU::isInlinableIntLiteral((int16_t)Imm);
+    return AMDGPU::isInlinableLiteralV2I16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
+    return AMDGPU::isInlinableLiteralV2F16(Imm);
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
-  case AMDGPU::OPERAND_REG_IMM_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     // Default handling
     break;
   }
+
+  case AMDGPU::S_MUL_U64:
+    // Split s_mul_u64 in 32-bit vector multiplications.
+    splitScalarSMulU64(Worklist, Inst, MDT);
+    Inst.eraseFromParent();
+    return;
+
+  case AMDGPU::S_MUL_U64_U32_PSEUDO:
+  case AMDGPU::S_MUL_I64_I32_PSEUDO:
+    // This is a special case of s_mul_u64 where all the operands are either
+    // zero extended or sign extended.
+    splitScalarSMulPseudo(Worklist, Inst, MDT);
+    Inst.eraseFromParent();
+    return;
+
   case AMDGPU::S_AND_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
     Inst.eraseFromParent();
@@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
+// There is not a vector equivalent of s_mul_u64. For this reason, we need to
+// split the s_mul_u64 in 32-bit vector multiplications.
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
+                                     MachineInstr &Inst,
+                                     MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src0SubRC))
+    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+  const TargetRegisterClass *Src1SubRC =
+      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src1SubRC))
+    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+  // First, we extract the low 32-bit and high 32-bit values from each of the
+  // operands.
+  MachineOperand Op0L =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand Op1L =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+  MachineOperand Op0H =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+  MachineOperand Op1H =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+  // The multilication is done as follows:
+  //
+  //                            Op1H  Op1L
+  //                          * Op0H  Op0L
+  //                       --------------------
+  //                       Op1H*Op0L  Op1L*Op0L
+  //          + Op1H*Op0H  Op1L*Op0H
+  // -----------------------------------------
+  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
+  //
+  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
+  //  value and that would overflow.
+  //  The low 32-bit value is Op1L*Op0L.
+  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
+
+  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1L_Op0H =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
+          .add(Op1L)
+          .add(Op0H);
+
+  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Op1H_Op0L =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
+          .add(Op1H)
+          .add(Op0L);
+
+  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Carry =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
+          .add(Op1L)
+          .add(Op0L);
+
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+          .add(Op1L)
+          .add(Op0L);
+
+  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
+                          .addReg(Op1L_Op0H_Reg)
+                          .addReg(Op1H_Op0L_Reg);
+
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
+          .addReg(AddReg)
+          .addReg(CarryReg);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*Op1L_Op0H, MDT);
+  legalizeOperands(*Op1H_Op0L, MDT);
+  legalizeOperands(*Carry, MDT);
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*Add, MDT);
+  legalizeOperands(*HiHalf, MDT);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
+// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
+// multiplications.
+void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
+                                        MachineInstr &Inst,
+                                        MachineDominatorTree *MDT) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+
+  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
+  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
+  const TargetRegisterClass *Src0SubRC =
+      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src0SubRC))
+    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
+  const TargetRegisterClass *Src1SubRC =
+      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
+  if (RI.isSGPRClass(Src1SubRC))
+    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
+
+  // First, we extract the low 32-bit and high 32-bit values from each of the
+  // operands.
+  MachineOperand Op0L =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand Op1L =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+  unsigned Opc = Inst.getOpcode();
+  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
+                        ? AMDGPU::V_MUL_HI_U32_e64
+                        : AMDGPU::V_MUL_HI_I32_e64;
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
+
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
+          .add(Op1L)
+          .add(Op0L);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(*HiHalf, MDT);
+  legalizeOperands(*LoHalf, MDT);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                            MachineInstr &Inst, unsigned Opcode,
                                            MachineDominatorTree *MDT) const {
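
The comment block inside the new splitScalarSMulU64 describes the long-multiplication scheme the lowering emits with V_MUL_LO_U32, V_MUL_HI_U32 and two V_ADD_U32 instructions. As a sanity check of that arithmetic only (not part of the commit), the standalone C++ sketch below mirrors the same decomposition with ordinary integers; the helper names are invented for illustration, and only the unsigned case is shown since the S_MUL_I64_I32_PSEUDO path simply substitutes V_MUL_HI_I32 for the high half.

// Illustrative only -- not part of the patch. Plain-integer model of the
// decomposition splitScalarSMulU64/splitScalarSMulPseudo emit; helper names
// are hypothetical.
#include <cassert>
#include <cstdint>

// General case: 64 x 64 -> 64 multiply built from 32-bit halves.
static uint64_t mulU64FromU32Parts(uint64_t A, uint64_t B) {
  uint32_t Op0L = uint32_t(A), Op0H = uint32_t(A >> 32);
  uint32_t Op1L = uint32_t(B), Op1H = uint32_t(B >> 32);

  uint32_t Lo = Op1L * Op0L;                                // V_MUL_LO_U32
  uint32_t Carry = uint32_t((uint64_t(Op1L) * Op0L) >> 32); // V_MUL_HI_U32
  // Op1H*Op0H is dropped: it only feeds bits >= 64 of the product.
  uint32_t Hi = Op1L * Op0H + Op1H * Op0L + Carry;          // two V_ADD_U32
  return (uint64_t(Hi) << 32) | Lo;                         // REG_SEQUENCE
}

// Pseudo case (unsigned): both operands are already zero-extended 32-bit
// values, so one mul_lo/mul_hi pair yields the whole 64-bit result.
static uint64_t mulU64FromU32Operands(uint32_t Op0L, uint32_t Op1L) {
  uint64_t Wide = uint64_t(Op1L) * Op0L;
  uint32_t Lo = uint32_t(Wide);       // V_MUL_LO_U32
  uint32_t Hi = uint32_t(Wide >> 32); // V_MUL_HI_U32
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  const uint64_t A = 0x123456789ABCDEF0ULL, B = 0x0FEDCBA987654321ULL;
  assert(mulU64FromU32Parts(A, B) == A * B);
  assert(mulU64FromU32Operands(0xFFFFFFFFu, 0xFFFFFFFFu) ==
         uint64_t(0xFFFFFFFFu) * 0xFFFFFFFFu);
  return 0;
}

Dropping Op1H*Op0H is safe because s_mul_u64 produces only the low 64 bits of the product, and that partial product contributes nothing below bit 64. The pseudo opcodes cover the case the in-tree comment calls out, where both operands are already zero- or sign-extended, so the three extra instructions needed for the general high half can be skipped.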
