| author    | Dimitry Andric <dim@FreeBSD.org>                            | 2017-05-29 16:25:25 +0000 |
|-----------|-------------------------------------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org>                            | 2017-05-29 16:25:25 +0000 |
| commit    | ab44ce3d598882e51a25eb82eb7ae6308de85ae6                    |                           |
| tree      | 568d786a59d49bef961dcb9bd09d422701b9da5b /lib/Target/AMDGPU |                           |
| parent    | b5630dbadf9a2a06754194387d6b0fd9962a67f1                    |                           |
Diffstat (limited to 'lib/Target/AMDGPU')
30 files changed, 891 insertions, 200 deletions
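Most of the changes below are encoding plumbing for the GFX9 flavor of SDWA (sub-dword addressing). The one self-contained algorithmic change is in AMDGPUISelLowering.cpp, where performShlCombine learns to rewrite shl (or|add x, c2), c1 into or|add (shl x, c1), (c2 << c1), treating or as add only when isOrEquivalentToAdd proves through known-bits analysis that the two operands have no set bits in common. A minimal standalone check of the arithmetic identity the combine relies on; the constants and the main() harness are illustrative, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // (x + c2) << c1 == (x << c1) + (c2 << c1) always holds (mod 2^64)...
  const uint64_t x = 0xABCD0000, c2 = 0x1234;
  const unsigned c1 = 4;
  assert(((x + c2) << c1) == ((x << c1) + (c2 << c1)));

  // ...and when x and c2 share no set bits, OR behaves exactly like ADD,
  // which is what isOrEquivalentToAdd establishes before the ISD::OR case
  // falls through into the ISD::ADD case.
  assert((x & c2) == 0);
  assert((x | c2) == (x + c2));
  assert(((x | c2) << c1) == ((x << c1) | (c2 << c1)));
  return 0;
}
```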
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index b279bd61e180..e7ebb37a9d62 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
     FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
     FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
     FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
-    FeatureFastFMAF32, FeatureDPP,
+    FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
     FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
   ]
 >;
@@ -534,10 +534,12 @@ def AMDGPUAsmVariants {
   int VOP3_ID = 1;
   string SDWA = "SDWA";
   int SDWA_ID = 2;
+  string SDWA9 = "SDWA9";
+  int SDWA9_ID = 3;
   string DPP = "DPP";
-  int DPP_ID = 3;
+  int DPP_ID = 4;
   string Disable = "Disable";
-  int Disable_ID = 4;
+  int Disable_ID = 5;
 }
 
 def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
@@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant {
   let Name = AMDGPUAsmVariants.SDWA;
 }
 
+def SDWA9AsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.SDWA9_ID;
+  let Name = AMDGPUAsmVariants.SDWA9;
+}
+
+
 def DPPAsmParserVariant : AsmParserVariant {
   let Variant = AMDGPUAsmVariants.DPP_ID;
   let Name = AMDGPUAsmVariants.DPP;
@@ -567,6 +575,7 @@ def AMDGPU : Target {
   let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
                                 VOP3AsmParserVariant,
                                 SDWAAsmParserVariant,
+                                SDWA9AsmParserVariant,
                                 DPPAsmParserVariant];
   let AssemblyWriters = [AMDGPUAsmWriter];
 }
@@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<"FeatureVOP3P">;
 
 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
-  AssemblerPredicate<"FeatureSDWA">;
+  AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
+
+def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
+  AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
 
 def HasDPP : Predicate<"Subtarget->hasDPP()">,
   AssemblerPredicate<"FeatureDPP">;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ec46a8294c0..723e8a7b54e2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
+bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
+{
+  assert(Op.getOpcode() == ISD::OR);
+
+  SDValue N0 = Op->getOperand(0);
+  SDValue N1 = Op->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  if (VT.isInteger() && !VT.isVector()) {
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(N0, LHSKnown);
+
+    if (LHSKnown.Zero.getBoolValue()) {
+      DAG.computeKnownBits(N1, RHSKnown);
+
+      if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
-    return SDValue();
 
   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
@@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     // shl (ext x) => zext (shl x), if shift does not overflow int
+    if (VT != MVT::i64)
+      break;
     KnownBits Known;
     SDValue X = LHS->getOperand(0);
     DAG.computeKnownBits(X, Known);
@@ -2628,8 +2651,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
     return DAG.getZExtOrTrunc(Shl, SL, VT);
   }
+  case ISD::OR:  if (!isOrEquivalentToAdd(DAG, LHS)) break;
+  case ISD::ADD: { // Fall through from above
+    // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+    if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+      SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
+                                SDValue(RHS, 0));
+      SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
+                                    SDLoc(C2), VT);
+      return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
+    }
+    break;
+  }
   }
 
+  if (VT != MVT::i64)
+    return SDValue();
+
   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
 
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
@@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                        DL);
     }
 
-    if ((OffsetVal + WidthVal) >= 32) {
+    if ((OffsetVal + WidthVal) >= 32 &&
+        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                          BitsFrom, ShiftVal);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fb2f15022d25..0d066cdbdff4 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -34,6 +34,9 @@ private:
   /// compare.
   SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
 
+public:
+  static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+
 protected:
   const AMDGPUSubtarget *Subtarget;
   AMDGPUAS AMDGPUASI;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9de302994e68..57905be18813 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
   setAction({G_CONSTANT, S32}, Legal);
   setAction({G_CONSTANT, S64}, Legal);
 
+  setAction({G_FCONSTANT, S32}, Legal);
+
   setAction({G_GEP, P1}, Legal);
   setAction({G_GEP, P2}, Legal);
   setAction({G_GEP, 1, S64}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85184b363905..07f92918a43f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
                                        Instruction *UseInst,
                                        int OpIdx0, int OpIdx1) const;
 
+  /// Check whether we have enough local memory for promotion.
+  bool hasSufficientLocalMem(const Function &F);
+
 public:
   static char ID;
 
@@ -107,7 +110,7 @@ public:
 
   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
-  void handleAlloca(AllocaInst &I);
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
-  FunctionType *FTy = F.getFunctionType();
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
-                      "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  const DataLayout &DL = Mod->getDataLayout();
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
-      continue;
-
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use)
-        continue;
-
-      if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
-        CurrentLocalMemUsage += AllocSize;
-        break;
-      }
-    }
-  }
-
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
-                                                          F);
-
-  // Restrict local memory usage so that we don't drastically reduce occupancy,
-  // unless it is already significantly reduced.
-
-  // TODO: Have some sort of hint or other heuristics to guess occupancy based
-  // on other factors..
-  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
-  if (OccupancyHint == 0)
-    OccupancyHint = 7;
-
-  // Clamp to max value.
-  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
-  // Check the hint but ignore it if it's obviously wrong from the existing LDS
-  // usage.
-  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program is possibly broken by using more local mem than available.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
-    return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
-
-  DEBUG(
-    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
-    << "  Rounding size to " << MaxSizeWithWaveCount
-    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-    << " available for promotion\n"
-  );
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
+  bool SufficientLDS = hasSufficientLocalMem(F);
+  bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 
     ++I;
     if (AI)
-      handleAlloca(*AI);
+      Changed |= handleAlloca(*AI, SufficientLDS);
   }
 
-  return true;
+  return Changed;
 }
 
 std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
   return true;
 }
 
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+  FunctionType *FTy = F.getFunctionType();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  return true;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
   if (!I.isStaticAlloca() || I.isArrayAllocation())
-    return;
+    return false;
 
   IRBuilder<> Builder(&I);
 
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
-    return;
-  }
+  if (tryPromoteAllocaToVector(&I, AS))
+    return true; // Promoted to vector.
 
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
     break;
   default:
     DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
-    return;
+    return false;
   }
 
+  // Not likely to have sufficient local memory for promotion.
+  if (!SufficientLDS)
+    return false;
+
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   if (NewSize > LocalMemLimit) {
     DEBUG(dbgs() << "  " << AllocSize
           << " bytes of local memory not available to promote\n");
-    return;
+    return false;
   }
 
   CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return;
+    return false;
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+  return true;
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index e543cae07ada..660879426810 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -416,6 +416,10 @@ public:
     return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
 
+  bool hasSDWA() const {
+    return HasSDWA;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -670,10 +674,6 @@ public:
     return HasInv2PiInlineImm;
   }
 
-  bool hasSDWA() const {
-    return HasSDWA;
-  }
-
   bool hasDPP() const {
     return HasDPP;
   }
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b52ea2b3a2c6..f5541e08e1b7 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -881,6 +881,10 @@ public:
     return AMDGPU::isVI(getSTI());
   }
 
+  bool isGFX9() const {
+    return AMDGPU::isGFX9(getSTI());
+  }
+
   bool hasInv2PiInlineImm() const {
     return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
   }
@@ -989,7 +993,6 @@ private:
   bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
   unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
-  bool isSGPR(unsigned Reg);
 
 public:
   OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1042,9 +1045,10 @@ public:
   OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
   void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
   void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+  void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
   void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
   void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
-               uint64_t BasicInstType);
+                uint64_t BasicInstType, bool skipVcc = false);
 };
 
 struct OptionalOperand {
@@ -1966,7 +1970,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
   }
 
   if (isForcedSDWA()) {
-    static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+    static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA,
+                                        AMDGPUAsmVariants::SDWA9};
     return makeArrayRef(Variants);
   }
 
@@ -1977,7 +1982,7 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
   static const unsigned Variants[] = {
     AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
-    AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+    AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
   };
 
   return makeArrayRef(Variants);
@@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
   return AMDGPU::NoRegister;
 }
 
-bool AMDGPUAsmParser::isSGPR(unsigned Reg) {
-  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
-  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
-  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
-  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
-         Reg == AMDGPU::SCC;
-}
-
 // NB: This code is correct only when used to check constant
 // bus limitations because GFX7 support no f16 inline constants.
 // Note that there are no cases when a GFX7 opcode violates
@@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
   if (MO.isImm()) {
     return !isInlineConstant(Inst, OpIdx);
   }
-  return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg()));
+  return !MO.isReg() ||
+         isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
 }
 
 bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
@@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
   if (Desc.TSFlags &
       (SIInstrFlags::VOPC |
        SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
-       SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) {
+       SIInstrFlags::VOP3 | SIInstrFlags::VOP3P |
+       SIInstrFlags::SDWA)) {
 
     // Check special imm operands (used by madmk, etc)
     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
@@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
   cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
 }
 
+void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+}
+
 void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
-  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI());
 }
 
 void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
-                              uint64_t BasicInstType) {
+                              uint64_t BasicInstType, bool skipVcc) {
   using namespace llvm::AMDGPU::SDWA;
 
   OptionalImmIndexMap OptionalIdx;
+  bool skippedVcc = false;
 
   unsigned I = 1;
   const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
   for (unsigned E = Operands.size(); I != E; ++I) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-    // Add the register arguments
-    if ((BasicInstType == SIInstrFlags::VOPC ||
-         BasicInstType == SIInstrFlags::VOP2)&&
-        Op.isReg() &&
-        Op.Reg.RegNo == AMDGPU::VCC) {
-      // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
-      // Skip it.
-      continue;
-    } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+    if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+      // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
+      // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
+      // Skip VCC only if we didn't skip it on previous iteration.
+      if (BasicInstType == SIInstrFlags::VOP2 &&
+          (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
+        skippedVcc = true;
+        continue;
+      } else if (BasicInstType == SIInstrFlags::VOPC &&
+                 Inst.getNumOperands() == 0) {
+        skippedVcc = true;
+        continue;
+      }
+    }
+    if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
       Op.addRegWithInputModsOperands(Inst, 2);
     } else if (Op.isImm()) {
       // Handle optional arguments
@@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
     } else {
       llvm_unreachable("Invalid operand type");
     }
+    skippedVcc = false;
   }
 
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-
-  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+      Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
     // V_NOP_sdwa_vi has no optional sdwa arguments
     switch (BasicInstType) {
    case SIInstrFlags::VOP1:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      if (isGFX9() &&
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
       break;
 
    case SIInstrFlags::VOP2:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      if (isGFX9() &&
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
@@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
       break;
 
    case SIInstrFlags::VOPC:
+      if (isVI()) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
       break;
@@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
       Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi)  {
     auto it = Inst.begin();
     std::advance(
-        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+      it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
     Inst.insert(it, Inst.getOperand(0)); // src2 = dst
   }
-
 }
 
 /// Force static initialization.
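The disassembler and code-emitter hunks that follow implement the nine-bit GFX9 SDWA source field: the low eight bits select a VGPR, and setting bit 8 (SRC_SGPR_MASK = 0x100 in the SDWA9EncValues enum added to SIDefines.h) turns the field into an SGPR number, which is why decoded SGPR values occupy the 256..357 range. A hedged round-trip sketch of that scheme; encodeSrc and decodeIsSGPR are illustrative names, not LLVM APIs:

```cpp
#include <cassert>

// Constants mirror SDWA9EncValues from SIDefines.h in this patch.
constexpr unsigned SRC_VGPR_MASK = 0xFF;
constexpr unsigned SRC_SGPR_MASK = 0x100;
constexpr unsigned SRC_SGPR_MIN  = 256;

// Pack a register number the way getSDWA9SrcEncoding does: keep the low
// eight bits, then set the SGPR flag bit when the register is scalar.
unsigned encodeSrc(unsigned RegNum, bool IsSGPR) {
  unsigned Enc = RegNum & SRC_VGPR_MASK;
  return IsSGPR ? (Enc | SRC_SGPR_MASK) : Enc;
}

// decodeSDWA9Src classifies by range: values below 256 are VGPRs.
bool decodeIsSGPR(unsigned Val) { return Val >= SRC_SGPR_MIN; }

int main() {
  assert(encodeSrc(17, false) == 17);    // v17 encodes as 17
  assert(encodeSrc(17, true) == 0x111);  // s17 encodes as 256 + 17
  assert(decodeIsSGPR(encodeSrc(17, true)));
  assert(!decodeIsSGPR(encodeSrc(17, false)));
  return 0;
}
```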
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 137b5cca96ce..9b3cde7c4df6 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, MCOperand::createImm(Imm));
 }
 
-#define DECODE_OPERAND2(RegClass, DecName) \
-static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
-                                                    unsigned Imm, \
-                                                    uint64_t /*Addr*/, \
-                                                    const void *Decoder) { \
+#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
+static DecodeStatus StaticDecoderName(MCInst &Inst, \
+                                       unsigned Imm, \
+                                       uint64_t /*Addr*/, \
+                                       const void *Decoder) { \
   auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
-  return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+  return addOperand(Inst, DAsm->DecoderName(Imm)); \
 }
 
-#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+#define DECODE_OPERAND_REG(RegClass) \
+DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
 
-DECODE_OPERAND(VGPR_32)
-DECODE_OPERAND(VS_32)
-DECODE_OPERAND(VS_64)
+DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VS_32)
+DECODE_OPERAND_REG(VS_64)
 
-DECODE_OPERAND(VReg_64)
-DECODE_OPERAND(VReg_96)
-DECODE_OPERAND(VReg_128)
+DECODE_OPERAND_REG(VReg_64)
+DECODE_OPERAND_REG(VReg_96)
+DECODE_OPERAND_REG(VReg_128)
 
-DECODE_OPERAND(SReg_32)
-DECODE_OPERAND(SReg_32_XM0_XEXEC)
-DECODE_OPERAND(SReg_64)
-DECODE_OPERAND(SReg_64_XEXEC)
-DECODE_OPERAND(SReg_128)
-DECODE_OPERAND(SReg_256)
-DECODE_OPERAND(SReg_512)
+DECODE_OPERAND_REG(SReg_32)
+DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
+DECODE_OPERAND_REG(SReg_64)
+DECODE_OPERAND_REG(SReg_64_XEXEC)
+DECODE_OPERAND_REG(SReg_128)
+DECODE_OPERAND_REG(SReg_256)
+DECODE_OPERAND_REG(SReg_512)
 
 static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
@@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
 }
 
+#define DECODE_SDWA9(DecName) \
+DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName)
+
+DECODE_SDWA9(Src32)
+DECODE_SDWA9(Src16)
+DECODE_SDWA9(VopcDst)
+
 #include "AMDGPUGenDisassemblerTables.inc"
 
 //===----------------------------------------------------------------------===//
@@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
       if (Res) break;
+
+      Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
+      if (Res) break;
     }
 
     // Reinitialize Bytes as DPP64 could have eaten too much
@@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
   return errOperand(Val, "unknown operand encoding " + Twine(Val));
 }
 
+MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width,
+                                             unsigned Val) const {
+  using namespace AMDGPU::SDWA;
+
+  if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+      Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+    return createRegOperand(getVgprClassId(Width),
+                            Val - SDWA9EncValues::SRC_VGPR_MIN);
+  }
+
+  if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+      Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+    return createSRegOperand(getSgprClassId(Width),
+                             Val - SDWA9EncValues::SRC_SGPR_MIN);
+  }
+
+  return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const {
+  return decodeSDWA9Src(OPW16, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const {
+  return decodeSDWA9Src(OPW32, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const {
+  using namespace AMDGPU::SDWA;
+
+  if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
+    Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+    if (Val > AMDGPU::EncValues::SGPR_MAX) {
+      return decodeSpecialReg64(Val);
+    } else {
+      return createSRegOperand(getSgprClassId(OPW64), Val);
+    }
+  } else {
+    return createRegOperand(AMDGPU::VCC);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUSymbolizer
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 620bae0a6d1a..0ff405a71e9b 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -104,6 +104,11 @@ public:
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
+
+  MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const;
+  MCOperand decodeSDWA9Src16(unsigned Val) const;
+  MCOperand decodeSDWA9Src32(unsigned Val) const;
+  MCOperand decodeSDWA9VopcDst(unsigned Val) const;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3bb5c9bc22b7..8ead48067336 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -191,6 +191,7 @@ public:
   }
 };
 
+namespace {
 // just a stub to make base class happy
 class SchedStrategyStub : public MachineSchedStrategy {
 public:
@@ -202,6 +203,7 @@ public:
   void releaseTopNode(SUnit *SU) override {}
   void releaseBottomNode(SUnit *SU) override {}
 };
+} // namespace
 
 GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
                                              StrategyKind S)
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index c6d0f2179950..d378df674be9 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -17,6 +17,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "misched"
 
+namespace {
 class GCNMinRegScheduler {
   struct Candidate : ilist_node<Candidate> {
     const SUnit *SU;
@@ -71,6 +72,7 @@ public:
   std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
                                      const ScheduleDAG &DAG);
 };
+} // namespace
 
 void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
   NumPreds.resize(SUnits.size());
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 18374dca3f84..390a8286c76a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
   return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
 }
 
-SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
-                                              const LiveIntervals &LIS,
-                                              const MachineRegisterInfo &MRI) {
+static SmallVector<RegisterMaskPair, 8>
+collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
+                      const MachineRegisterInfo &MRI) {
   SmallVector<RegisterMaskPair, 8> Res;
   for (const auto &MO : MI.operands()) {
     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 3d3858ab47ec..a856b17a228f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,6 +52,18 @@ public:
     return 0;
   }
 
+  virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
+  virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
 protected:
   uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
   void verifyInstructionPredicates(const MCInst &MI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index bda0928036fd..e02acf516c0d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -69,6 +69,14 @@ public:
   unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const override;
+
+  unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const override;
+
+  unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const override;
 };
 
 } // end anonymous namespace
@@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
   return getMachineOpValue(MI, MO, Fixups, STI);
 }
 
+unsigned
+SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  using namespace AMDGPU::SDWA;
+
+  uint64_t RegEnc = 0;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  unsigned Reg = MO.getReg();
+  RegEnc |= MRI.getEncodingValue(Reg);
+  RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+  if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+    RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+  }
+  return RegEnc;
+}
+
+unsigned
+SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  using namespace AMDGPU::SDWA;
+
+  uint64_t RegEnc = 0;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  unsigned Reg = MO.getReg();
+  if (Reg != AMDGPU::VCC) {
+    RegEnc |= MRI.getEncodingValue(Reg);
+    RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+    RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
+  }
+  return RegEnc;
+}
+
 uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                             const MCOperand &MO,
                                        SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 3590a9b05e1d..60b913cfd39a 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    return VT.changeVectorElementTypeToInteger();
 }
 
+bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+  // Local and Private addresses do not handle vectors. Limit to i32
+  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+    return (MemVT.getSizeInBits() <= 32);
+  }
+  return true;
+}
+
 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                         unsigned AddrSpace,
                                                         unsigned Align,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 9700ce14c6f3..d6a0876a6ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -44,6 +44,8 @@ public:
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
                          EVT VT) const override;
 
+  bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index cc667d985a82..3c1e8527284c 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
     R600_Addr,
     R600_KC0, R600_KC1,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
-    ALU_CONST, ALU_PARAM, OQAP
+    ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR
     )>;
 
 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a01330cb9171..80967edee0ab 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -118,6 +118,10 @@ namespace AMDGPU {
     // Operand for source modifiers for VOP instructions
     OPERAND_INPUT_MODS,
 
+    // Operand for GFX9 SDWA instructions
+    OPERAND_SDWA9_SRC,
+    OPERAND_SDWA9_VOPC_DST,
+
    /// Operand with 32-bit immediate that uses the constant bus.
     OPERAND_KIMM32,
     OPERAND_KIMM16
@@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants {
     DEFAULT = 0,
     VOP3 = 1,
     SDWA = 2,
-    DPP = 3
+    SDWA9 = 3,
+    DPP = 4
   };
 }
 
@@ -294,6 +299,18 @@ enum DstUnused {
   UNUSED_PRESERVE = 2,
 };
 
+enum SDWA9EncValues{
+  SRC_SGPR_MASK = 0x100,
+  SRC_VGPR_MASK = 0xFF,
+  VOPC_DST_VCC_MASK = 0x80,
+  VOPC_DST_SGPR_MASK = 0x7F,
+
+  SRC_VGPR_MIN = 0,
+  SRC_VGPR_MAX = 255,
+  SRC_SGPR_MIN = 256,
+  SRC_SGPR_MAX = 357,
+};
+
 } // namespace SDWA
 } // namespace AMDGPU
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca4..76c2644867aa 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
   }
 }
 
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+    return (MemVT.getSizeInBits() <= 4 * 32);
+  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
+    return (MemVT.getSizeInBits() <= MaxPrivateBits);
+  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+    return (MemVT.getSizeInBits() <= 2 * 32);
+  }
+  return true;
+}
+
 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
@@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
   SDValue RHS = N->getOperand(1);
 
-  if (VT == MVT::i64) {
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-    if (CRHS) {
-      if (SDValue Split
-          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
-        return Split;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (VT == MVT::i64 && CRHS) {
+    if (SDValue Split
+        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+      return Split;
+  }
+
+  if (CRHS && VT == MVT::i32) {
+    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+    // nb = number of trailing zeroes in mask
+    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+    uint64_t Mask = CRHS->getZExtValue();
+    unsigned Bits = countPopulation(Mask);
+    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+        unsigned Shift = CShift->getZExtValue();
+        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+        unsigned Offset = NB + Shift;
+        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+          SDLoc SL(N);
+          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                    LHS->getOperand(0),
+                                    DAG.getConstant(Offset, SL, MVT::i32),
+                                    DAG.getConstant(Bits, SL, MVT::i32));
+          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+                                    DAG.getValueType(NarrowVT));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+          return Shl;
+        }
+      }
     }
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index e68837747491..8e2ec40b224c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -150,6 +150,8 @@ public:
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
 
+  bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38a16b525a75..36d29b8ecf06 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI.getOpcode();
+
+  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
+    return true;
+
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 7b052844f177..c5287c7f64ba 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
   let ParserMatchClass = VReg32OrOffClass;
 }
 
+class SDWA9Src : RegisterOperand<VS_32> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_SDWA9_SRC";
+  let EncoderMethod = "getSDWA9SrcEncoding";
+}
+
+def SDWA9Src32 : SDWA9Src {
+  let DecoderMethod = "decodeSDWA9Src32";
+}
+
+def SDWA9Src16 : SDWA9Src {
+  let DecoderMethod = "decodeSDWA9Src16";
+}
+
+def SDWA9VopcDst : VOPDstOperand<SReg_64> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_SDWA9_VOPC_DST";
+  let EncoderMethod = "getSDWA9VopcDstEncoding";
+  let DecoderMethod = "decodeSDWA9VopcDst";
+}
+
 class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
   let Name = "Imm"#CName;
   let PredicateMethod = "is"#CName;
@@ -588,6 +609,16 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
 def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
 def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
 
+def FPRegInputModsMatchClass : AsmOperandClass {
+  let Name = "RegWithFPInputMods";
+  let ParserMethod = "parseRegWithFPInputMods";
+  let PredicateMethod = "isRegKind";
+}
+
+def FPRegInputMods : InputMods <FPRegInputModsMatchClass> {
+  let PrintMethod = "printOperandAndFPInputMods";
+}
+
 def FPVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithFPInputMods";
   let ParserMethod = "parseRegWithFPInputMods";
@@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
   let PrintMethod = "printOperandAndFPInputMods";
 }
 
+
+def IntRegInputModsMatchClass : AsmOperandClass {
+  let Name = "RegWithIntInputMods";
+  let ParserMethod = "parseRegWithIntInputMods";
+  let PredicateMethod = "isRegKind";
+}
+
+def IntRegInputMods : InputMods <IntRegInputModsMatchClass> {
+  let PrintMethod = "printOperandAndIntInputMods";
+}
+
 def IntVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithIntInputMods";
   let ParserMethod = "parseRegWithIntInputMods";
@@ -783,6 +825,14 @@ class getVALUDstForVT<ValueType VT> {
                               VOPDstOperand<SReg_64>)))); // else VT == i1
 }
 
+// Returns the register class to use for the destination of VOP[12C]
+// instructions with GFX9 SDWA extension
+class getSDWA9DstForVT<ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 1),
+                            SDWA9VopcDst, // VOPC
+                            VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
+}
+
 // Returns the register class to use for source 0 of VOP[12C]
 // instructions for the given VT.
 class getVOPSrc0ForVT<ValueType VT> {
@@ -823,6 +873,9 @@ class getVregSrcForVT<ValueType VT> {
                         !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
 }
 
+class getSDWA9SrcForVT <ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32);
+}
 
 // Returns the register class to use for sources of VOP3 instructions for the
 // given VT.
@@ -926,6 +979,15 @@ class getSrcModExt <ValueType VT> {
   Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
 }
 
+// Return type of input modifiers operand specified input operand for SDWA 9
+class getSrcModSDWA9 <ValueType VT> {
+    bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+               !if(!eq(VT.Value, f32.Value), 1,
+               !if(!eq(VT.Value, f64.Value), 1,
+               0)));
+  Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods);
+}
+
 // Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {    dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,                 // VOP1 without input operands (V_NOP)                 (ins),              !if(!eq(NumSrcArgs, 1), +               // VOP1_SDWA                 (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,                      src0_sel:$src0_sel), @@ -1071,7 +1134,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,                    (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                         Src1Mod:$src1_modifiers, Src1RC:$src1,                         clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), -                  // VOP2_SDWA or VOPC_SDWA with modifiers +                  // VOP2_SDWA with modifiers                    (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                         Src1Mod:$src1_modifiers, Src1RC:$src1,                         clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -1079,12 +1142,65 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,              (ins)/* endif */)));  } +// Ins for GFX9 SDWA +class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, +                   bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, +                   ValueType DstVT> { + +  dag ret = !if(!eq(NumSrcArgs, 0), +               // VOP1 without input operands (V_NOP) +               (ins), +            !if(!eq(NumSrcArgs, 1), +               // VOP1 +               !if(!eq(HasSDWAOMod, 0), +                  // VOP1_SDWA9 without omod +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       clampmod:$clamp, +                       dst_sel:$dst_sel, dst_unused:$dst_unused, +                       src0_sel:$src0_sel), +                  // VOP1_SDWA9 with omod +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       clampmod:$clamp, omod:$omod, +                       dst_sel:$dst_sel, dst_unused:$dst_unused, +                       src0_sel:$src0_sel)), +            !if(!eq(NumSrcArgs, 2), +               !if(!eq(DstVT.Size, 1), +                  // VOPC_SDWA9 +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       Src1Mod:$src1_modifiers, Src1RC:$src1, +                       src0_sel:$src0_sel, src1_sel:$src1_sel), +                  // VOP2_SDWA9 +                  !if(!eq(HasSDWAOMod, 0), +                     // VOP2_SDWA9 without omod +                     (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                          Src1Mod:$src1_modifiers, Src1RC:$src1, +                          clampmod:$clamp, +                          dst_sel:$dst_sel, dst_unused:$dst_unused, +                          src0_sel:$src0_sel, src1_sel:$src1_sel), +                     // VOP1_SDWA9 with omod +                     (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                          Src1Mod:$src1_modifiers, Src1RC:$src1, +                          clampmod:$clamp, omod:$omod, +                          dst_sel:$dst_sel, dst_unused:$dst_unused, +                          src0_sel:$src0_sel, src1_sel:$src1_sel))), +            (ins)/* endif */))); +} +  // Outs for DPP and SDWA -class getOutsExt <bit HasDst, ValueType DstVT, 
RegisterOperand DstRCDPP> { +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {    dag ret = !if(HasDst,                  !if(!eq(DstVT.Size, 1),                      (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions -                    (outs DstRCDPP:$vdst)), +                    (outs DstRCExt:$vdst)), +                (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { +  dag ret = !if(HasDst, +                !if(!eq(DstVT.Size, 1), +                    (outs DstRCSDWA9:$sdst), +                    (outs DstRCSDWA9:$vdst)),                  (outs)); // V_NOP  } @@ -1153,8 +1269,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =    string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";  } -class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, -                  ValueType DstVT = i32> { +class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {    string dst = !if(HasDst,                     !if(!eq(DstVT.Size, 1),                         " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,    string ret = dst#args#sdwa;  } +class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, +                   ValueType DstVT = i32> { +  string dst = !if(HasDst, +                   !if(!eq(DstVT.Size, 1), +                       "$sdst", // VOPC +                       "$vdst"), // VOP1/2 +                    ""); +  string src0 = "$src0_modifiers"; +  string src1 = "$src1_modifiers"; +  string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); +  string args = !if(!eq(NumSrcArgs, 0), "", +                    !if(!eq(NumSrcArgs, 1), +                        ", "#src0, +                        ", "#src0#", "#src1 +                     ) +                ); +  string sdwa = !if(!eq(NumSrcArgs, 0), "", +                    !if(!eq(NumSrcArgs, 1), +                        out_mods#" $dst_sel $dst_unused $src0_sel", +                        !if(!eq(DstVT.Size, 1), +                            " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC +                            out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" +                        ) +                    ) +                ); +  string ret = dst#args#sdwa; +} + +  // Function that checks if instruction supports DPP and SDWA  class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,                   ValueType Src1VT = i32> { @@ -1219,6 +1363,7 @@ class VOPProfile <list<ValueType> _ArgVT> {    field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;    field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;    field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; +  field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret;    field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;    field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;    field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1228,6 +1373,8 @@ class VOPProfile <list<ValueType> _ArgVT> {    field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;    field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;    field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; +  field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; +  field RegisterOperand Src1SDWA9 = 
getSDWA9SrcForVT<Src0VT>.ret;    field Operand Src0Mod = getSrcMod<Src0VT>.ret;    field Operand Src1Mod = getSrcMod<Src1VT>.ret;    field Operand Src2Mod = getSrcMod<Src2VT>.ret; @@ -1235,6 +1382,8 @@ class VOPProfile <list<ValueType> _ArgVT> {    field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;    field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;    field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; +  field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; +  field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret;    field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile <list<ValueType> _ArgVT> {    field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);    field bit HasClamp = HasModifiers; -  field bit HasSDWAClamp = HasSrc0; +  field bit HasSDWAClamp = EmitDst;    field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;    field bit IsPacked = isPackedType<Src0VT>.ret;    field bit HasOpSel = IsPacked;    field bit HasOMod = !if(HasOpSel, 0, HasModifiers); +  field bit HasSDWAOMod = isFloatType<DstVT>.ret;    field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; +  field bit HasSDWA9 = HasExt;    field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);    field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile <list<ValueType> _ArgVT> {    field dag Outs64 = Outs;    field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;    field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; +  field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret;    field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;    field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1296,16 +1448,21 @@ class VOPProfile <list<ValueType> _ArgVT> {    field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,                                   HasModifiers, Src0ModSDWA, Src1ModSDWA,                                   DstVT>.ret; +  field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, +                                   HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, +                                   DstVT>.ret;    field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;    field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;    field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;    field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; -  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; +  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; +  field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;  }  class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {    let HasExt = 0; +  let HasSDWA9 = 0;  }  def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping {    let ValueCols = [["SDWA"]];  } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { +  let FilterClass = "VOP"; +  let RowFields = ["OpName"]; +  let ColFields = ["AsmVariantName"]; +  let KeyCol = ["Default"]; +  let ValueCols = [["SDWA9"]]; +} +  def getMaskedMIMGOp : InstrMapping {    let FilterClass = "MIMG_Mask";    let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td 
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index f2d8b6f7b7a4..ec29a66c8bbb 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32    <"s_bitset0_b32">;
 def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
 def S_BITSET1_B32 : SOP1_32    <"s_bitset1_b32">;
 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
-def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64">;
+def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64",
+  [(set i64:$sdst, (int_amdgcn_s_getpc))]
+>;
 
 let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
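Aside (not part of the commit): the new selection pattern ties the llvm.amdgcn.s.getpc intrinsic directly to s_getpc_b64. A minimal sketch of emitting that intrinsic with IRBuilder; the helper name is illustrative, the module/builder setup is assumed context:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emits `%pc = call i64 @llvm.amdgcn.s.getpc()`; with the pattern above,
// instruction selection can now lower this call to s_getpc_b64.
static Value *emitGetPC(Module &M, IRBuilder<> &B) {
  Function *GetPC = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_s_getpc);
  return B.CreateCall(GetPC);
}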
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2abd4afad3b6..630f469eabf0 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
 }
 
+bool isGFX9(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
+  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+    Reg == AMDGPU::SCC;
+}
+
 unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
 
   switch(Reg) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8e74aa2cc9a8..19888ad7556a 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) {
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
 bool isVI(const MCSubtargetInfo &STI);
+bool isGFX9(const MCSubtargetInfo &STI);
+
+/// \brief Is \p Reg a scalar register?
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
 
 /// If \p Reg is a pseudo reg, return the correct hardware register given
 /// \p STI otherwise return \p Reg.
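Aside (not part of the commit): a comment later in this same patch notes that at most one GFX9 SDWA source may be an SGPR, which is the kind of constraint isSGPR enables checking at the MC layer. A hypothetical validator, invented here for illustration:

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCRegisterInfo.h"

using namespace llvm;

// Hypothetical operand check: GFX9 SDWA permits an SGPR in place of a
// VGPR for src0 or src1, but not for both at once.
static bool isLegalSDWA9SrcPair(unsigned Src0, unsigned Src1,
                                const MCRegisterInfo *TRI) {
  return !(AMDGPU::isSGPR(Src0, TRI) && AMDGPU::isSGPR(Src1, TRI));
}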
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 1febc6bf8ec2..95b5ef0a49db 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31-25} = 0x3f; // encoding
 }
 
+class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; // encoding
+}
+
 class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
   let Src0RC64 = VRegSrc_32;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
 }
 
 // Special case because there are no true output operands.  Hack vdst
@@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
   let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
   let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
-  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
+  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel);
+  let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel);
 
   let Asm32 = getAsm32<1, 1>.ret;
   let Asm64 = getAsm64<1, 1, 0, 1>.ret;
   let AsmDPP = getAsmDPP<1, 1, 0>.ret;
-  let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+  let AsmSDWA = getAsmSDWA<1, 1>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
   let HasDst = 0;
   let EmitDst = 1; // force vdst emission
 }
@@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
@@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 
 }
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 def : Pat<
     (f32 (f16_to_fp i16:$src)),
@@ -523,6 +544,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
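Aside (not part of the commit): a C++ sketch of the low dword that VOP1_SDWA9Ae describes, to make the field layout concrete; the packing helper is invented:

#include <cstdint>

// Packs the low dword of a GFX9 VOP1 SDWA instruction per VOP1_SDWA9Ae:
// 0xf9 in [8:0] flags the SDWA word, the VOP1 opcode sits in [16:9],
// vdst in [24:17], and [31:25] holds the 0x3f VOP1 encoding.
static uint32_t packVOP1SDWA9LowDword(uint8_t Op, uint8_t VDst) {
  return 0xf9u | (uint32_t(Op) << 9) | (uint32_t(VDst) << 17) | (0x3fu << 25);
}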
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 4a11d9471f1d..657cacaa792c 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -48,6 +48,18 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31}    = 0x0; // encoding
 }
 
+class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+  bits<9> src1;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; // encoding
+  let Inst{63}    = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
 class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set P.DstVT:$vdst,
@@ -121,10 +138,10 @@ multiclass VOP2Inst <string opName,
   def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
              Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-  def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa  : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
 }
 
-// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
                       SDPatternOperator node = null_frag,
@@ -136,7 +153,13 @@ multiclass VOP2bInst <string opName,
       def _e32 : VOP2_Pseudo <opName, P>,
                  Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
-      def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+      def _sdwa  : VOP2_SDWA_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
+
+      def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -203,13 +226,21 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
                     VGPR_32:$src2, // stub argument
                     clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      VGPR_32:$src2, // stub argument
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let Asm32 = getAsm32<1, 2, vt>.ret;
   let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
   let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
-  let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+  let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
   let HasSrc2 = 0;
   let HasSrc2Mods = 0;
   let HasExt = 1;
+  let HasSDWA9 = 0;
 }
 
 def VOP_MAC_F16 : VOP_MAC <f16> {
@@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
   let Asm32 = "$vdst, vcc, $src0, $src1";
   let Asm64 = "$vdst, $sdst, $src0, $src1";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
   let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   // implicit VCC use.
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
 
-  let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
-                     Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+  let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+                     Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+
   let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
                     Src1Mod:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
   let HasExt = 1;
+  let HasSDWA9 = 1;
 }
 
 // Read in from vcc or arbitrary SGPR
@@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
 
 } // End let SubtargetPredicate = SICI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
@@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
 }
 } // End isCommutable = 1
-} // End SubtargetPredicate = isVI
+} // End SubtargetPredicate = Has16BitInsts
 
 // Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat <
   (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
 >;
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
 defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
@@ -513,7 +553,7 @@ def : Pat<
   (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
 >;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 //===----------------------------------------------------------------------===//
 // SI
@@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
     VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 }
 
+multiclass VOP2_SDWA9_Real <bits<6> op> {
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+}
+
 multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
 }
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c0b5069948fb..001fc960b228 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 let isCommutable = 1 in {
 
@@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 
 }  // End isCommutable = 1
+} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = isVI in {
 def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
 } // End SubtargetPredicate = isVI
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
                              Instruction inst, SDPatternOperator op3> {
@@ -288,7 +289,7 @@ def : Pat<
 defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
 defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 let SubtargetPredicate = isGFX9 in {
 def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index a3550a63677b..cd347b86d305 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{44-43} = SDWA.UNUSED_PRESERVE;
 }
 
+class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
+  bits<9> src1;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e; // encoding
+  let Inst{63}    = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
+
 //===----------------------------------------------------------------------===//
 // VOPC classes
 //===----------------------------------------------------------------------===//
@@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOPC";
 }
 
+class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
 // This class is used only with VOPC instructions. Use $sdst for out operand
 class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
   InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -173,6 +189,13 @@ multiclass VOPC_Pseudos <string opName,
     let isConvergent = DefExec;
     let isCompare = 1;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = P.Schedule;
+    let isConvergent = DefExec;
+    let isCompare = 1;
+  }
 }
 
 def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -520,7 +543,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
   let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+  //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
   let HasSrc1Mods = 0;
   let HasClamp = 0;
   let HasOMod = 0;
@@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
     let SchedRW = p.Schedule;
     let isConvergent = DefExec;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = p.Schedule;
+    let isConvergent = DefExec;
+  }
 }
 
 def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -920,6 +953,10 @@ multiclass VOPC_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
                        !cast<Instruction>(NAME#"_e32_vi")> {
     let AssemblerPredicate = isVI;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 69906c419db3..4da654f84f9d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -293,11 +293,52 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
   let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
   let Inst{45}    = !if(P.HasSDWAClamp, clamp{0}, 0);
   let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
-  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{51}    = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{59}    = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+}
+
+// gfx9 SDWA basic encoding
+class VOP_SDWA9e<VOPProfile P> : Enc64 {
+  bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
+  bits<3> src0_sel;
+  bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+  bits<3> src1_sel;
+  bits<2> src1_modifiers;
+  bits<1> src1_sgpr;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{51}    = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+  let Inst{55}    = !if(P.HasSrc0, src0{8}, 0);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
   let Inst{59}    = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+  let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+  let Inst{63}    = 0; // src1_sgpr - should be specified in subclass
+}
+
+// gfx9 SDWA-A
+class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<3> dst_sel;
+  bits<2> dst_unused;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{45}    = !if(P.HasSDWAClamp, clamp{0}, 0);
+  let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
+}
+
+// gfx9 SDWA-B
+class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
+
+  let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+  let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
 }
 
 class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
@@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   VOPProfile Pfl = P;
 }
 
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+//    a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+//       than VGPRs (at most 1 can be an SGPR);
+//    b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+//    replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+//    field.
+//    a. When SD=1, the SDST is used as the destination for the compare result;
+//    b. When SD=0, VCC is used.
+//
+// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
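Aside (not part of the commit): the GFX9-only fields enumerated above pack as follows; a hedged C++ sketch using the bit positions from VOP_SDWA9e and VOP_SDWA9Ae (the helper itself is invented):

#include <cstdint>

// src0 is 9 bits wide, {src0_sgpr, src0[7:0]}; omod is the 2-bit output
// modifier (*2, *4, /2) written to [47:46] in the SDWA-A form.
static uint64_t packSDWA9Src0AndOMod(uint16_t Src0 /*9 bits*/, uint8_t OMod) {
  uint64_t Inst = 0;
  Inst |= uint64_t(Src0 & 0xff) << 32;     // [39:32] src0{7-0}
  Inst |= uint64_t((Src0 >> 8) & 1) << 55; // [55]    S0: src0 is an SGPR
  Inst |= uint64_t(OMod & 0x3) << 46;      // [47:46] OMOD
  return Inst;
}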
+
+class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
+  MnemonicAlias <opName#"_sdwa9", opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.AsmSDWA9;
+
+  let Size = 8;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+
+  let VALU = 1;
+  let SDWA = 1;
+  let Uses = [EXEC];
+
+  let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+                                     AMDGPUAsmVariants.Disable);
+  let DecoderNamespace = "SDWA9";
+
+  VOPProfile Pfl = P;
+}
+
 class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
@@ -358,6 +443,33 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let TSFlags              = ps.TSFlags;
 }
 
+class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
+}
+
 class VOP_DPPe<VOPProfile P> : Enc64 {
   bits<2> src0_modifiers;
   bits<8> src0;
