| field | value | date |
|---|---|---|
| author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-29 16:25:25 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-29 16:25:25 +0000 |
| commit | ab44ce3d598882e51a25eb82eb7ae6308de85ae6 (patch) | |
| tree | 568d786a59d49bef961dcb9bd09d422701b9da5b /lib/Target/AMDGPU | |
| parent | b5630dbadf9a2a06754194387d6b0fd9962a67f1 (diff) | |
Diffstat (limited to 'lib/Target/AMDGPU')
30 files changed, 891 insertions, 200 deletions
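The `AMDGPUISelLowering.cpp` hunks in the diff below add a static `isOrEquivalentToAdd(SelectionDAG &, SDValue)` helper and let the `shl` combine fall through from `ISD::OR` to `ISD::ADD`, folding `shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)`. The underlying fact: an OR behaves exactly like an ADD when the operands have no possibly-set bit in common, since no carries can be generated. Below is a minimal standalone sketch of that disjoint-bits test, with plain masks in place of LLVM's `KnownBits` analysis; the function name `orEquivalentToAdd` and the `main` driver are illustrative, not part of the patch.

```cpp
#include <cassert>
#include <cstdint>

// The knownZero masks model what DAG.computeKnownBits() proves about each
// operand; ~knownZero is the set of bits that may still be one.  An OR is
// equivalent to an ADD when those "maybe one" sets do not overlap, because
// then no bit position can produce a carry.
static bool orEquivalentToAdd(uint32_t lhsKnownZero, uint32_t rhsKnownZero) {
  return (~lhsKnownZero & ~rhsKnownZero) == 0;
}

int main() {
  // (x << 4) has its low four bits known zero, so (x << 4) | 3 == (x << 4) + 3,
  // and shl (or x, 3), c1 can become add (shl x, c1), (3 << c1).
  assert(orEquivalentToAdd(0x0000000Fu, ~0x00000003u));
  // If nothing is known zero on the left, the OR may not be carry-free.
  assert(!orEquivalentToAdd(0x00000000u, ~0x00000003u));
  return 0;
}
```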
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index b279bd61e1809..e7ebb37a9d62e 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureSDWA, FeatureDPP, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -534,10 +534,12 @@ def AMDGPUAsmVariants { int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; + string SDWA9 = "SDWA9"; + int SDWA9_ID = 3; string DPP = "DPP"; - int DPP_ID = 3; + int DPP_ID = 4; string Disable = "Disable"; - int Disable_ID = 4; + int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA; } +def SDWA9AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA9_ID; + let Name = AMDGPUAsmVariants.SDWA9; +} + + def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; @@ -567,6 +575,7 @@ def AMDGPU : Target { let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, + SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; } @@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA">; + AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; + +def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5ec46a8294c0c..723e8a7b54e2f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } +bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) +{ + assert(Op.getOpcode() == ISD::OR); + + SDValue N0 = Op->getOperand(0); + SDValue N1 = Op->getOperand(1); + EVT VT = N0.getValueType(); + + if (VT.isInteger() && !VT.isVector()) { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N0, LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N1, RHSKnown); + + if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) + return true; + } + } + + return false; +} + AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT != MVT::i64) - return SDValue(); ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!RHS) @@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { // shl (ext x) => zext (shl x), if shift does not overflow int + if (VT != MVT::i64) + break; 
KnownBits Known; SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); @@ -2628,8 +2651,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } + case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break; + case ISD::ADD: { // Fall through from above + // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) + if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), + SDValue(RHS, 0)); + SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, + SDLoc(C2), VT); + return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); + } + break; + } } + if (VT != MVT::i64) + return SDValue(); + // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) // On some subtargets, 64-bit shift is a quarter rate instruction. In the @@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DL); } - if ((OffsetVal + WidthVal) >= 32) { + if ((OffsetVal + WidthVal) >= 32 && + !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fb2f15022d259..0d066cdbdff4d 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -34,6 +34,9 @@ private: /// compare. SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; +public: + static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + protected: const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9de302994e680..57905be188134 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); + setAction({G_FCONSTANT, S32}, Legal); + setAction({G_GEP, P1}, Legal); setAction({G_GEP, P2}, Legal); setAction({G_GEP, 1, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 85184b363905e..07f92918a43fe 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -97,6 +97,9 @@ private: Instruction *UseInst, int OpIdx0, int OpIdx1) const; + /// Check whether we have enough local memory for promotion. + bool hasSufficientLocalMem(const Function &F); + public: static char ID; @@ -107,7 +110,7 @@ public: StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - void handleAlloca(AllocaInst &I); + bool handleAlloca(AllocaInst &I, bool SufficientLDS); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); if (!ST.isPromoteAllocaEnabled()) return false; - AS = AMDGPU::getAMDGPUAS(*F.getParent()); - - FunctionType *FTy = F.getFunctionType(); - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. 
- for (Type *ParamTy : FTy->params()) { - PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { - LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - return false; - } - } - - LocalMemLimit = ST.getLocalMemorySize(); - if (LocalMemLimit == 0) - return false; - - const DataLayout &DL = Mod->getDataLayout(); - - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; - for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) - continue; - - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast<Instruction>(U); - if (!Use) - continue; - - if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. - uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); - CurrentLocalMemUsage += AllocSize; - break; - } - } - } - - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, - F); - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - - - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - - // Program is possibly broken by using more local mem than available. - if (CurrentLocalMemUsage > MaxSizeWithWaveCount) - return false; - - LocalMemLimit = MaxSizeWithWaveCount; - - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); + AS = AMDGPU::getAMDGPUAS(*F.getParent()); + bool SufficientLDS = hasSufficientLocalMem(F); + bool Changed = false; BasicBlock &EntryBB = *F.begin(); for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { AllocaInst *AI = dyn_cast<AllocaInst>(I); ++I; if (AI) - handleAlloca(*AI); + Changed |= handleAlloca(*AI, SufficientLDS); } - return true; + return Changed; } std::pair<Value *, Value *> @@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } +bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { + + FunctionType *FTy = F.getFunctionType(); + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. 
+ for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + return false; + } + } + + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast<Instruction>(U); + if (!Use) + continue; + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; + } + } + } + + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); + + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. + + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint = ST.getWavesPerEU(F).second; + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + + // Program is possibly broken by using more local mem than available. + if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + return true; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. -void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { +bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) - return; + return false; IRBuilder<> Builder(&I); @@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, AS)) { - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - return; - } + if (tryPromoteAllocaToVector(&I, AS)) + return true; // Promoted to vector. 
const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); - return; + return false; } + // Not likely to have sufficient local memory for promotion. + if (!SufficientLDS) + return false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; @@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); - return; + return false; } CurrentLocalMemUsage = NewSize; @@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; + return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); @@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + return true; } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index e543cae07ada0..660879426810f 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -416,6 +416,10 @@ public: return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } + bool hasSDWA() const { + return HasSDWA; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -670,10 +674,6 @@ public: return HasInv2PiInlineImm; } - bool hasSDWA() const { - return HasSDWA; - } - bool hasDPP() const { return HasDPP; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b52ea2b3a2c61..f5541e08e1b72 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -881,6 +881,10 @@ public: return AMDGPU::isVI(getSTI()); } + bool isGFX9() const { + return AMDGPU::isGFX9(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -989,7 +993,6 @@ private: bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; - bool isSGPR(unsigned Reg); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1042,9 +1045,10 @@ public: OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType); + uint64_t BasicInstType, bool skipVcc = false); }; struct OptionalOperand { @@ -1966,7 +1970,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { } if (isForcedSDWA()) { - static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA, + AMDGPUAsmVariants::SDWA9}; return makeArrayRef(Variants); } @@ -1977,7 +1982,7 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP }; return makeArrayRef(Variants); @@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { return AMDGPU::NoRegister; } -bool AMDGPUAsmParser::isSGPR(unsigned Reg) { - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); - return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || - Reg == AMDGPU::SCC; -} - // NB: This code is correct only when used to check constant // bus limitations because GFX7 support no f16 inline constants. 
// Note that there are no cases when a GFX7 opcode violates @@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); } - return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg())); + return !MO.isReg() || + isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { @@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { if (Desc.TSFlags & (SIInstrFlags::VOPC | SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | - SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) { + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | + SIInstrFlags::SDWA)) { // Check special imm operands (used by madmk, etc) if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { @@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); } +void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); +} + void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI()); } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType) { + uint64_t BasicInstType, bool skipVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; + bool skippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || - BasicInstType == SIInstrFlags::VOP2)&& - Op.isReg() && - Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. - // Skip it. - continue; - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) + // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. + // Skip VCC only if we didn't skip it on previous iteration. 
+ if (BasicInstType == SIInstrFlags::VOP2 && + (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { + skippedVcc = true; + continue; + } else if (BasicInstType == SIInstrFlags::VOPC && + Inst.getNumOperands() == 0) { + skippedVcc = true; + continue; + } + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments @@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } + skippedVcc = false; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // V_NOP_sdwa_vi has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); @@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: + if (isVI()) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { auto it = Inst.begin(); std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } - } /// Force static initialization. 
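The `AMDGPUDisassembler` and `SIMCCodeEmitter` hunks that follow implement the 9-bit GFX9 SDWA source-operand encoding (`getSDWA9SrcEncoding` / `decodeSDWA9Src`): a VGPR number occupies the low eight bits, and an SGPR operand sets bit 8, which the decoder sees as values from `SRC_SGPR_MIN` (256) upward, per the `SDWA9EncValues` constants added to `SIDefines.h`. Here is a hedged round-trip sketch of that scheme; plain integers stand in for `MCOperand`s, and the `isSgpr` flag replaces the real `AMDGPU::isSGPR(Reg, TRI)` register-class query.

```cpp
#include <cassert>

// Constants mirror SDWA9EncValues in the SIDefines.h hunk below.
constexpr unsigned SRC_VGPR_MASK = 0xFF;  // low 8 bits: register number
constexpr unsigned SRC_SGPR_MASK = 0x100; // bit 8: operand is an SGPR
constexpr unsigned SRC_SGPR_MIN  = 256;   // first decoder value mapped to SGPRs

// Encoder side, as in SIMCCodeEmitter::getSDWA9SrcEncoding.
unsigned encodeSDWA9Src(unsigned regNo, bool isSgpr) {
  unsigned enc = regNo & SRC_VGPR_MASK;
  if (isSgpr)
    enc |= SRC_SGPR_MASK; // the real code tests AMDGPU::isSGPR(Reg, TRI)
  return enc;
}

// Decoder side, as in AMDGPUDisassembler::decodeSDWA9Src: values below 256
// name VGPRs directly; values from 256 up name SGPRs, offset by SRC_SGPR_MIN.
unsigned decodeSDWA9Src(unsigned enc, bool &isSgpr) {
  isSgpr = enc >= SRC_SGPR_MIN;
  return isSgpr ? enc - SRC_SGPR_MIN : enc;
}

int main() {
  bool sgpr = false;
  assert(decodeSDWA9Src(encodeSDWA9Src(17, /*isSgpr=*/false), sgpr) == 17 && !sgpr); // v17
  assert(decodeSDWA9Src(encodeSDWA9Src(5, /*isSgpr=*/true), sgpr) == 5 && sgpr);     // s5
  return 0;
}
```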
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 137b5cca96ce8..9b3cde7c4df60 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } -#define DECODE_OPERAND2(RegClass, DecName) \ -static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ +static DecodeStatus StaticDecoderName(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \ - return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) +#define DECODE_OPERAND_REG(RegClass) \ +DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) -DECODE_OPERAND(VGPR_32) -DECODE_OPERAND(VS_32) -DECODE_OPERAND(VS_64) +DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VS_32) +DECODE_OPERAND_REG(VS_64) -DECODE_OPERAND(VReg_64) -DECODE_OPERAND(VReg_96) -DECODE_OPERAND(VReg_128) +DECODE_OPERAND_REG(VReg_64) +DECODE_OPERAND_REG(VReg_96) +DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0_XEXEC) -DECODE_OPERAND(SReg_64) -DECODE_OPERAND(SReg_64_XEXEC) -DECODE_OPERAND(SReg_128) -DECODE_OPERAND(SReg_256) -DECODE_OPERAND(SReg_512) +DECODE_OPERAND_REG(SReg_32) +DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_64) +DECODE_OPERAND_REG(SReg_64_XEXEC) +DECODE_OPERAND_REG(SReg_128) +DECODE_OPERAND_REG(SReg_256) +DECODE_OPERAND_REG(SReg_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, @@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +#define DECODE_SDWA9(DecName) \ +DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) + +DECODE_SDWA9(Src32) +DECODE_SDWA9(Src16) +DECODE_SDWA9(VopcDst) + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + if (Res) break; } // Reinitialize Bytes as DPP64 could have eaten too much @@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, + unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } + + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { + return decodeSDWA9Src(OPW16, Val); +} + +MCOperand 
AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { + return decodeSDWA9Src(OPW32, Val); +} + + +MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { + Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + if (Val > AMDGPU::EncValues::SGPR_MAX) { + return decodeSpecialReg64(Val); + } else { + return createSRegOperand(getSgprClassId(OPW64), Val); + } + } else { + return createRegOperand(AMDGPU::VCC); + } +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 620bae0a6d1a9..0ff405a71e9be 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -104,6 +104,11 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + + MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWA9Src16(unsigned Val) const; + MCOperand decodeSDWA9Src32(unsigned Val) const; + MCOperand decodeSDWA9VopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3bb5c9bc22b7d..8ead480673363 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -191,6 +191,7 @@ public: } }; +namespace { // just a stub to make base class happy class SchedStrategyStub : public MachineSchedStrategy { public: @@ -202,6 +203,7 @@ public: void releaseTopNode(SUnit *SU) override {} void releaseBottomNode(SUnit *SU) override {} }; +} // namespace GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index c6d0f21799508..d378df674be9b 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -17,6 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "misched" +namespace { class GCNMinRegScheduler { struct Candidate : ilist_node<Candidate> { const SUnit *SU; @@ -71,6 +72,7 @@ public: std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, const ScheduleDAG &DAG); }; +} // namespace void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { NumPreds.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 18374dca3f840..390a8286c76a8 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); } -SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI, - const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { +static SmallVector<RegisterMaskPair, 8> +collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { SmallVector<RegisterMaskPair, 8> Res; for (const auto &MO : MI.operands()) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) diff --git 
a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 3d3858ab47ece..a856b17a228f0 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,6 +52,18 @@ public: return 0; } + virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; void verifyInstructionPredicates(const MCInst &MI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index bda0928036fde..e02acf516c0db 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,6 +69,14 @@ public: unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned +SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; +} + +unsigned +SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + return RegEnc; +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 3590a9b05e1d0..60b913cfd39af 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + // Local and Private addresses do not handle vectors. 
Limit to i32 + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + return (MemVT.getSizeInBits() <= 32); + } + return true; +} + bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 9700ce14c6f31..d6a0876a6ee7d 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,6 +44,8 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index cc667d985a82e..3c1e8527284cf 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_Addr, R600_KC0, R600_KC1, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP + ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a01330cb9171e..80967edee0ab1 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,6 +118,10 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, + // Operand for GFX9 SDWA instructions + OPERAND_SDWA9_SRC, + OPERAND_SDWA9_VOPC_DST, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16 @@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants { DEFAULT = 0, VOP3 = 1, SDWA = 2, - DPP = 3 + SDWA9 = 3, + DPP = 4 }; } @@ -294,6 +299,18 @@ enum DstUnused { UNUSED_PRESERVE = 2, }; +enum SDWA9EncValues{ + SRC_SGPR_MASK = 0x100, + SRC_VGPR_MASK = 0xFF, + VOPC_DST_VCC_MASK = 0x80, + VOPC_DST_SGPR_MASK = 0x7F, + + SRC_VGPR_MIN = 0, + SRC_VGPR_MAX = 255, + SRC_SGPR_MIN = 256, + SRC_SGPR_MAX = 357, +}; + } // namespace SDWA } // namespace AMDGPU diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 01c1f78e7ca40..76c2644867aa1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } } +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + return (MemVT.getSizeInBits() <= 4 * 32); + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); + return (MemVT.getSizeInBits() <= MaxPrivateBits); + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + return (MemVT.getSizeInBits() <= 2 * 32); + } + return true; +} + bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, @@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = 
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index e68837747491d..8e2ec40b224cd 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -150,6 +150,8 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 38a16b525a75f..36d29b8ecf066 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 7b052844f177b..c5287c7f64ba4 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +class SDWA9Src : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_SRC"; + let EncoderMethod = "getSDWA9SrcEncoding"; +} + +def SDWA9Src32 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src32"; +} + +def SDWA9Src16 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src16"; +} + +def SDWA9VopcDst : VOPDstOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_VOPC_DST"; + let EncoderMethod = "getSDWA9VopcDstEncoding"; + let DecoderMethod = "decodeSDWA9VopcDst"; +} + class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -588,6 +609,16 @@ 
class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isRegKind"; +} + +def FPRegInputMods : InputMods <FPRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } + +def IntRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isRegKind"; +} + +def IntRegInputMods : InputMods <IntRegInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -783,6 +825,14 @@ class getVALUDstForVT<ValueType VT> { VOPDstOperand<SReg_64>)))); // else VT == i1 } +// Returns the register class to use for the destination of VOP[12C] +// instructions with GFX9 SDWA extension +class getSDWA9DstForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 1), + SDWA9VopcDst, // VOPC + VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst +} + // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { @@ -823,6 +873,9 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } +class getSDWA9SrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +} // Returns the register class to use for sources of VOP3 instructions for the // given VT. @@ -926,6 +979,15 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand specified input operand for SDWA 9 +class getSrcModSDWA9 <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, // VOP1 without input operands (V_NOP) (ins), !if(!eq(NumSrcArgs, 1), + // VOP1_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), @@ -1071,7 +1134,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA or VOPC_SDWA with modifiers + // VOP2_SDWA with modifiers (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -1079,12 +1142,65 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, (ins)/* endif */))); } +// Ins for GFX9 SDWA +class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + // VOP1 + !if(!eq(HasSDWAOMod, 0), + // VOP1_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel)), + !if(!eq(NumSrcArgs, 2), + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA9 + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA9 + !if(!eq(HasSDWAOMod, 0), + // VOP2_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel))), + (ins)/* endif */))); +} + // Outs for DPP and SDWA -class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> { +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions - (outs DstRCDPP:$vdst)), + (outs DstRCExt:$vdst)), + (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs DstRCSDWA9:$sdst), + (outs DstRCSDWA9:$vdst)), (outs)); // V_NOP } @@ -1153,8 +1269,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; } -class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, - ValueType DstVT = i32> { +class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit 
HasFloatModifiers, string ret = dst#args#sdwa; } +class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, + ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", // VOPC + "$vdst"), // VOP1/2 + ""); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; + string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); + string args = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + ", "#src0, + ", "#src0#", "#src1 + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + out_mods#" $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC + out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + + // Function that checks if instruction supports DPP and SDWA class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32> { @@ -1219,6 +1363,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1228,6 +1373,8 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; + field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; @@ -1235,6 +1382,8 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; + field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); field bit HasClamp = HasModifiers; - field bit HasSDWAClamp = HasSrc0; + field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; field bit IsPacked = isPackedType<Src0VT>.ret; field bit HasOpSel = IsPacked; field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasSDWA9 = HasExt; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; + field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, 
NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1296,16 +1448,21 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; + field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, + HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret; field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; - field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; + field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; } class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; + let HasSDWA9 = 0; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA9"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index f2d8b6f7b7a4b..ec29a66c8bbbe 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", + [(set i64:$sdst, (int_amdgcn_s_getpc))] +>; let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2abd4afad3b6c..630f469eabf05 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } +bool isGFX9(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return SGPRClass.contains(FirstSubReg != 0 ? 
FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 8e74aa2cc9a8b..19888ad7556a6 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) { bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); +bool isGFX9(const MCSubtargetInfo &STI); + +/// \brief Is Reg - scalar register +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 1febc6bf8ec20..95b5ef0a49dba 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{31-25} = 0x3f; // encoding } +class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> { + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : InstSI <P.Outs32, P.Ins32, "", pattern>, VOP <opName>, @@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA9_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOP1"; +} + class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; + def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; + let HasSDWA9 = 0; } // Special case because there are no true output operands. 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2abd4afad3b6c..630f469eabf05 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
 }
 
+bool isGFX9(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
+  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+    Reg == AMDGPU::SCC;
+}
+
 unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
 
   switch(Reg) {
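isSGPR above tests SReg_32 membership, so wide scalar registers (SGPR pairs, quads) are classified via their first 32-bit sub-register, and SCC is special-cased. A self-contained sketch of that fallback logic with mocked register numbers (none of the names below are LLVM API):

#include <cstdio>
#include <set>

// Mocked register numbers; arbitrary stand-ins, not LLVM's encoding.
enum MockReg { VGPR0 = 1, SGPR0, SGPR1, SGPR0_SGPR1, SCC_REG };

static const std::set<int> SReg32 = {SGPR0, SGPR1};        // the SReg_32 class
static int firstSubReg(int R) { return R == SGPR0_SGPR1 ? SGPR0 : 0; }

static bool isSGPRMock(int R) {
  int Sub = firstSubReg(R);              // wide regs: classify by sub-reg 1
  return SReg32.count(Sub != 0 ? Sub : R) != 0 || R == SCC_REG;
}

int main() {
  std::printf("%d %d %d\n",
              isSGPRMock(SGPR0_SGPR1),   // 1: via first sub-register
              isSGPRMock(SGPR1),         // 1: direct SReg_32 membership
              isSGPRMock(VGPR0));        // 0
}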
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8e74aa2cc9a8b..19888ad7556a6 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) {
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
 bool isVI(const MCSubtargetInfo &STI);
+bool isGFX9(const MCSubtargetInfo &STI);
+
+/// \brief Is Reg - scalar register
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
 
 /// If \p Reg is a pseudo reg, return the correct hardware register given
 /// \p STI otherwise return \p Reg.
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 1febc6bf8ec20..95b5ef0a49dba 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31-25} = 0x3f; // encoding
 }
 
+class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+
+  let Inst{8-0} = 0xf9; // sdwa
+  let Inst{16-9} = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; // encoding
+}
+
 class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
   let Src0RC64 = VRegSrc_32;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
 }
 
 // Special case because there are no true output operands.  Hack vdst
@@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
   let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
   let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl,
                     row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
-  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
+  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel);
+  let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel);
 
   let Asm32 = getAsm32<1, 1>.ret;
   let Asm64 = getAsm64<1, 1, 0, 1>.ret;
   let AsmDPP = getAsmDPP<1, 1, 0>.ret;
-  let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+  let AsmSDWA = getAsmSDWA<1, 1>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
   let HasDst = 0;
   let EmitDst = 1; // force vdst emission
 }
@@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
@@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 
 }
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 def : Pat<
     (f32 (f16_to_fp i16:$src)),
@@ -523,6 +544,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
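Per VOP1_SDWA9Ae above, the low dword of a GFX9 VOP1 SDWA instruction keeps the VOP1 layout but puts the 0xf9 SDWA marker in the src0 slot. A hedged sketch that packs just that dword from the field positions in the class (opcode and register values are placeholders):

#include <cstdint>
#include <cstdio>

// Low dword of a GFX9 VOP1 SDWA instruction, fields per VOP1_SDWA9Ae:
// Inst{8-0}=0xf9 (SDWA marker), Inst{16-9}=op, Inst{24-17}=vdst,
// Inst{31-25}=0x3f (VOP1 encoding).
static uint32_t vop1Sdwa9Dword0(uint32_t op, uint32_t vdst) {
  return 0xf9u
       | ((op   & 0xffu) << 9)
       | ((vdst & 0xffu) << 17)
       | (0x3fu << 25);
}

int main() {
  std::printf("0x%08x\n", (unsigned)vop1Sdwa9Dword0(/*op=*/1, /*vdst=*/0));
}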
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 4a11d9471f1d6..657cacaa792ca 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -48,6 +48,18 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31} = 0x0; // encoding
 }
 
+class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+  bits<9> src1;
+
+  let Inst{8-0} = 0xf9; // sdwa
+  let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; // encoding
+  let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
 class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
       [(set P.DstVT:$vdst,
@@ -121,10 +138,10 @@ multiclass VOP2Inst <string opName,
   def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
              Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-  def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
 }
 
-// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
                       SDPatternOperator node = null_frag,
@@ -136,7 +153,13 @@ multiclass VOP2bInst <string opName,
       def _e32 : VOP2_Pseudo <opName, P>,
                  Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
-      def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+      def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
+
+      def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
     }
 
   def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -203,13 +226,21 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
                      VGPR_32:$src2, // stub argument
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      VGPR_32:$src2, // stub argument
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let Asm32 = getAsm32<1, 2, vt>.ret;
   let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
   let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
-  let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+  let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
   let HasSrc2 = 0;
   let HasSrc2Mods = 0;
   let HasExt = 1;
+  let HasSDWA9 = 0;
 }
 
 def VOP_MAC_F16 : VOP_MAC <f16> {
@@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
   let Asm32 = "$vdst, vcc, $src0, $src1";
   let Asm64 = "$vdst, $sdst, $src0, $src1";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
   let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   // implicit VCC use.
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
 
-  let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
-                     Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+  let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+                     Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
 
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+
   let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
                     Src1Mod:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
   let HasExt = 1;
+  let HasSDWA9 = 1;
 }
 
 // Read in from vcc or arbitrary SGPR
@@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
 
 } // End let SubtargetPredicate = SICI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
@@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
 }
 } // End isCommutable = 1
 
-} // End SubtargetPredicate = isVI
+} // End SubtargetPredicate = Has16BitInsts
 
 // Note: 16-bit instructions produce a 0 result in the high 16-bits.
 multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat <
                 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
 >;
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
 defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
@@ -513,7 +553,7 @@ def : Pat<
   (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
 >;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 //===----------------------------------------------------------------------===//
 // SI
@@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
     VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 }
 
+multiclass VOP2_SDWA9_Real <bits<6> op> {
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+}
+
 multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
 }
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
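VOP2_SDWA9Ae differs from the VOP1 variant in where the opcode sits (bits 30-25) and in carrying a 9-bit src1: the low byte goes to bits 16-9 and the ninth bit, which flags an SGPR source, goes to bit 63 of the 64-bit word. A sketch under the same caveats as the VOP1 example:

#include <cstdint>

// GFX9 VOP2 SDWA skeleton, fields per VOP2_SDWA9Ae. src1 is 9 bits: the low
// byte sits in Inst{16-9}, while bit 8 (set when src1 is an SGPR) is stored
// far away in Inst{63}. High-dword fields are omitted here.
static uint64_t vop2Sdwa9Skeleton(uint32_t op, uint32_t src1, uint32_t vdst) {
  uint64_t Inst = 0xf9;                       // Inst{8-0} SDWA marker
  Inst |= (uint64_t)(src1 & 0xff) << 9;       // Inst{16-9} src1{7-0}
  Inst |= (uint64_t)(vdst & 0xff) << 17;      // Inst{24-17}
  Inst |= (uint64_t)(op & 0x3f) << 25;        // Inst{30-25}; Inst{31} = 0
  Inst |= (uint64_t)((src1 >> 8) & 1) << 63;  // Inst{63} src1_sgpr
  return Inst;
}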
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c0b5069948fb4..001fc960b228c 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 let isCommutable = 1 in {
 
@@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 
 } // End isCommutable = 1
+} // End SubtargetPredicate = Has16BitInsts
 
+let SubtargetPredicate = isVI in {
 def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
 } // End SubtargetPredicate = isVI
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
                              Instruction inst, SDPatternOperator op3> {
@@ -288,7 +289,7 @@ def : Pat<
 defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
 defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 let SubtargetPredicate = isGFX9 in {
 def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index a3550a63677ba..cd347b86d3050 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{44-43} = SDWA.UNUSED_PRESERVE;
 }
 
+class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
+  bits<9> src1;
+
+  let Inst{8-0} = 0xf9; // sdwa
+  let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e; // encoding
+  let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
+
 //===----------------------------------------------------------------------===//
 // VOPC classes
 //===----------------------------------------------------------------------===//
@@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOPC";
 }
 
+class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
 // This class is used only with VOPC instructions. Use $sdst for out operand
 class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
   InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -173,6 +189,13 @@ multiclass VOPC_Pseudos <string opName,
     let isConvergent = DefExec;
     let isCompare = 1;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = P.Schedule;
+    let isConvergent = DefExec;
+    let isCompare = 1;
+  }
 }
 
 def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -520,7 +543,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
   let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+  //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
   let HasSrc1Mods = 0;
   let HasClamp = 0;
   let HasOMod = 0;
@@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
     let SchedRW = p.Schedule;
    let isConvergent = DefExec;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = p.Schedule;
+    let isConvergent = DefExec;
+  }
 }
 
 def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -920,6 +953,10 @@ multiclass VOPC_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
                        !cast<Instruction>(NAME#"_e32_vi")> {
     let AssemblerPredicate = isVI;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 69906c419db3b..4da654f84f9d1 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -293,11 +293,52 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
   let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
   let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
   let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
-  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
 }
 
+// gfx9 SDWA basic encoding
+class VOP_SDWA9e<VOPProfile P> : Enc64 {
+  bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
+  bits<3> src0_sel;
+  bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+  bits<3> src1_sel;
+  bits<2> src1_modifiers;
+  bits<1> src1_sgpr;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+  let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+  let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+  let Inst{63} = 0; // src1_sgpr - should be specified in subclass
+}
+
+// gfx9 SDWA-A
+class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<3> dst_sel;
+  bits<2> dst_unused;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
+  let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
+}
+
+// gfx9 SDWA-B
+class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
+
+  let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+  let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
+}
+
 class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
@@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   VOPProfile Pfl = P;
 }
 
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+//    a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+//       than VGPRs (at most 1 can be an SGPR);
+//    b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+//    replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+//    field.
+//    a. When SD=1, the SDST is used as the destination for the compare result;
+//    b. when SD=0, VCC is used.
+//
+// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
+
+class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
+  MnemonicAlias <opName#"_sdwa9", opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.AsmSDWA9;
+
+  let Size = 8;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+
+  let VALU = 1;
+  let SDWA = 1;
+  let Uses = [EXEC];
+
+  let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+                                       AMDGPUAsmVariants.Disable);
+  let DecoderNamespace = "SDWA9";
+
+  VOPProfile Pfl = P;
+}
+
 class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
@@ -358,6 +443,33 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let TSFlags = ps.TSFlags;
 }
 
+class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AssemblerPredicate = ps.AssemblerPredicate;
+  let AsmMatchConverter = ps.AsmMatchConverter;
+  let AsmVariantName = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace = ps.DecoderNamespace;
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+  let TSFlags = ps.TSFlags;
+}
+
 class VOP_DPPe<VOPProfile P> : Enc64 {
   bits<2> src0_modifiers;
   bits<8> src0;
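The three encoding classes above split the high dword: VOP_SDWA9e carries the shared source fields, including src0's SGPR flag in bit 55; VOP_SDWA9Ae adds dst_sel, dst_unused, clamp, and the new omod for ordinary VALU ops; VOP_SDWA9Be reuses bits 47-40 for SD plus SDST on VOPC compares, as described in the comment block in VOPInstructions.td. A compact sketch of both layouts (bit positions taken from the classes; everything else illustrative):

#include <cstdint>

// Shared high-dword fields (VOP_SDWA9e). src0 is 9 bits: src0{7-0} sits in
// Inst{39-32} and the SGPR flag S0 (src0{8}) in Inst{55}. Modifier bits and
// validity checks are omitted.
static uint64_t sdwa9Common(uint32_t src0, uint32_t src0Sel, uint32_t src1Sel) {
  uint64_t I = 0;
  I |= (uint64_t)(src0 & 0xff)     << 32;  // Inst{39-32} src0{7-0}
  I |= (uint64_t)(src0Sel & 7)     << 48;  // Inst{50-48} src0_sel
  I |= (uint64_t)((src0 >> 8) & 1) << 55;  // Inst{55} src0_sgpr (S0)
  I |= (uint64_t)(src1Sel & 7)     << 56;  // Inst{58-56} src1_sel
  return I;
}

// SDWA-A (VOP_SDWA9Ae): destination controls plus clamp and the new omod.
static uint64_t sdwa9A(uint32_t dstSel, uint32_t dstUnused, bool clamp,
                       uint32_t omod) {
  return ((uint64_t)(dstSel & 7)    << 40)  // Inst{42-40} dst_sel
       | ((uint64_t)(dstUnused & 3) << 43)  // Inst{44-43} dst_unused
       | ((uint64_t)(clamp ? 1 : 0) << 45)  // Inst{45} clamp
       | ((uint64_t)(omod & 3)      << 46); // Inst{47-46} omod (*2, *4, /2)
}

// SDWA-B (VOP_SDWA9Be, VOPC only): SDST plus the SD bit replace the fields
// above; per the comment block, SD=1 writes the compare to SDST, SD=0 to VCC.
static uint64_t sdwa9B(uint32_t sdst) {
  return ((uint64_t)(sdst & 0x7f)     << 40)  // Inst{46-40} sdst{6-0}
       | ((uint64_t)((sdst >> 7) & 1) << 47); // Inst{47} SD
}

int main() {
  // VALU form composes common | SDWA-A; a VOPC form would use sdwa9B instead.
  uint64_t valu = sdwa9Common(0x102, 6, 6) | sdwa9A(6, 2, false, 0);
  uint64_t vopc = sdwa9Common(0x002, 6, 6) | sdwa9B(0x80 | 10);
  return (int)((valu ^ vopc) & 1);
}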