| author    | Dimitry Andric <dim@FreeBSD.org>                            | 2017-05-29 16:25:25 +0000 |
|-----------|-------------------------------------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org>                            | 2017-05-29 16:25:25 +0000 |
| commit    | ab44ce3d598882e51a25eb82eb7ae6308de85ae6                    |                           |
| tree      | 568d786a59d49bef961dcb9bd09d422701b9da5b /lib/Target/AMDGPU |                           |
| parent    | b5630dbadf9a2a06754194387d6b0fd9962a67f1                    |                           |
Diffstat (limited to 'lib/Target/AMDGPU')
30 files changed, 891 insertions, 200 deletions
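Most of the changes below are encoding plumbing for the GFX9 flavor of SDWA (sub-dword addressing). The one self-contained algorithmic change is in AMDGPUISelLowering.cpp, where performShlCombine learns to rewrite shl (or|add x, c2), c1 into or|add (shl x, c1), (c2 << c1), treating or as add only when isOrEquivalentToAdd proves through known-bits analysis that the two operands have no set bits in common. A minimal standalone check of the arithmetic identity the combine relies on; the constants and the main() harness are illustrative, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // (x + c2) << c1 == (x << c1) + (c2 << c1) always holds (mod 2^64)...
  const uint64_t x = 0xABCD0000, c2 = 0x1234;
  const unsigned c1 = 4;
  assert(((x + c2) << c1) == ((x << c1) + (c2 << c1)));

  // ...and when x and c2 share no set bits, OR behaves exactly like ADD,
  // which is what isOrEquivalentToAdd establishes before the ISD::OR case
  // falls through into the ISD::ADD case.
  assert((x & c2) == 0);
  assert((x | c2) == (x + c2));
  assert(((x | c2) << c1) == ((x << c1) | (c2 << c1)));
  return 0;
}
```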
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index b279bd61e180..e7ebb37a9d62 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
     FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
     FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
     FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
-    FeatureFastFMAF32, FeatureDPP,
+    FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
     FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
   ]
 >;
@@ -534,10 +534,12 @@ def AMDGPUAsmVariants {
   int VOP3_ID = 1;
   string SDWA = "SDWA";
   int SDWA_ID = 2;
+  string SDWA9 = "SDWA9";
+  int SDWA9_ID = 3;
   string DPP = "DPP";
-  int DPP_ID = 3;
+  int DPP_ID = 4;
   string Disable = "Disable";
-  int Disable_ID = 4;
+  int Disable_ID = 5;
 }
 
 def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
@@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant {
   let Name = AMDGPUAsmVariants.SDWA;
 }
 
+def SDWA9AsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.SDWA9_ID;
+  let Name = AMDGPUAsmVariants.SDWA9;
+}
+
+
 def DPPAsmParserVariant : AsmParserVariant {
   let Variant = AMDGPUAsmVariants.DPP_ID;
   let Name = AMDGPUAsmVariants.DPP;
@@ -567,6 +575,7 @@ def AMDGPU : Target {
   let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
                                 VOP3AsmParserVariant,
                                 SDWAAsmParserVariant,
+                                SDWA9AsmParserVariant,
                                 DPPAsmParserVariant];
   let AssemblyWriters = [AMDGPUAsmWriter];
 }
@@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<"FeatureVOP3P">;
 
 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
-  AssemblerPredicate<"FeatureSDWA">;
+  AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
+
+def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
+  AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
 
 def HasDPP : Predicate<"Subtarget->hasDPP()">,
   AssemblerPredicate<"FeatureDPP">;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ec46a8294c0..723e8a7b54e2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
+bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
+{
+  assert(Op.getOpcode() == ISD::OR);
+
+  SDValue N0 = Op->getOperand(0);
+  SDValue N1 = Op->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  if (VT.isInteger() && !VT.isVector()) {
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(N0, LHSKnown);
+
+    if (LHSKnown.Zero.getBoolValue()) {
+      DAG.computeKnownBits(N1, RHSKnown);
+
+      if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
-    return SDValue();
 
   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
@@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     // shl (ext x) => zext (shl x), if shift does not overflow int
+    if (VT != MVT::i64)
+      break;
     KnownBits Known;
     SDValue X = LHS->getOperand(0);
     DAG.computeKnownBits(X, Known);
@@ -2628,8 +2651,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
     return DAG.getZExtOrTrunc(Shl, SL, VT);
   }
+  case ISD::OR:  if (!isOrEquivalentToAdd(DAG, LHS)) break;
+  case ISD::ADD: { // Fall through from above
+    // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+    if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+      SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
+                                SDValue(RHS, 0));
+      SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
+                                    SDLoc(C2), VT);
+      return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
+    }
+    break;
+  }
   }
 
+  if (VT != MVT::i64)
+    return SDValue();
+
   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
 
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
@@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                        DL);
     }
 
-    if ((OffsetVal + WidthVal) >= 32) {
+    if ((OffsetVal + WidthVal) >= 32 &&
+        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                          BitsFrom, ShiftVal);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fb2f15022d25..0d066cdbdff4 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -34,6 +34,9 @@ private:
   /// compare.
   SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
 
+public:
+  static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+
 protected:
   const AMDGPUSubtarget *Subtarget;
   AMDGPUAS AMDGPUASI;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9de302994e68..57905be18813 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
   setAction({G_CONSTANT, S32}, Legal);
   setAction({G_CONSTANT, S64}, Legal);
 
+  setAction({G_FCONSTANT, S32}, Legal);
+
   setAction({G_GEP, P1}, Legal);
   setAction({G_GEP, P2}, Legal);
   setAction({G_GEP, 1, S64}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85184b363905..07f92918a43f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
                                        Instruction *UseInst,
                                        int OpIdx0, int OpIdx1) const;
 
+  /// Check whether we have enough local memory for promotion.
+  bool hasSufficientLocalMem(const Function &F);
+
 public:
   static char ID;
 
@@ -107,7 +110,7 @@ public:
 
   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
-  void handleAlloca(AllocaInst &I);
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
-  FunctionType *FTy = F.getFunctionType();
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
-                      "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  const DataLayout &DL = Mod->getDataLayout();
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
-      continue;
-
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use)
-        continue;
-
-      if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
-        CurrentLocalMemUsage += AllocSize;
-        break;
-      }
-    }
-  }
-
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
-                                                          F);
-
-  // Restrict local memory usage so that we don't drastically reduce occupancy,
-  // unless it is already significantly reduced.
-
-  // TODO: Have some sort of hint or other heuristics to guess occupancy based
-  // on other factors..
-  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
-  if (OccupancyHint == 0)
-    OccupancyHint = 7;
-
-  // Clamp to max value.
-  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
-  // Check the hint but ignore it if it's obviously wrong from the existing LDS
-  // usage.
-  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program is possibly broken by using more local mem than available.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
-    return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
-
-  DEBUG(
-    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
-    << "  Rounding size to " << MaxSizeWithWaveCount
-    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-    << " available for promotion\n"
-  );
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
+  bool SufficientLDS = hasSufficientLocalMem(F);
+  bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 
     ++I;
     if (AI)
-      handleAlloca(*AI);
+      Changed |= handleAlloca(*AI, SufficientLDS);
   }
 
-  return true;
+  return Changed;
 }
 
 std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
   return true;
 }
 
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+  FunctionType *FTy = F.getFunctionType();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  return true;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
   if (!I.isStaticAlloca() || I.isArrayAllocation())
-    return;
+    return false;
 
   IRBuilder<> Builder(&I);
 
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
-    return;
-  }
+  if (tryPromoteAllocaToVector(&I, AS))
+    return true; // Promoted to vector.
 
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
     break;
   default:
     DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
-    return;
+    return false;
   }
 
+  // Not likely to have sufficient local memory for promotion.
+  if (!SufficientLDS)
+    return false;
+
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   if (NewSize > LocalMemLimit) {
     DEBUG(dbgs() << "  " << AllocSize
           << " bytes of local memory not available to promote\n");
-    return;
+    return false;
   }
 
   CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return;
+    return false;
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+  return true;
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index e543cae07ada..660879426810 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -416,6 +416,10 @@ public:
     return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
 
+  bool hasSDWA() const {
+    return HasSDWA;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -670,10 +674,6 @@ public:
     return HasInv2PiInlineImm;
   }
 
-  bool hasSDWA() const {
-    return HasSDWA;
-  }
-
   bool hasDPP() const {
     return HasDPP;
   }
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b52ea2b3a2c6..f5541e08e1b7 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -881,6 +881,10 @@ public:
     return AMDGPU::isVI(getSTI());
   }
 
+  bool isGFX9() const {
+    return AMDGPU::isGFX9(getSTI());
+  }
+
   bool hasInv2PiInlineImm() const {
     return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
   }
@@ -989,7 +993,6 @@ private:
   bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
   unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
-  bool isSGPR(unsigned Reg);
 
 public:
   OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1042,9 +1045,10 @@ public:
   OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
   void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
   void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+  void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
   void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
   void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
-               uint64_t BasicInstType);
+                uint64_t BasicInstType, bool skipVcc = false);
 };
 
 struct OptionalOperand {
@@ -1966,7 +1970,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
   }
 
   if (isForcedSDWA()) {
-    static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+    static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA,
+                                        AMDGPUAsmVariants::SDWA9};
     return makeArrayRef(Variants);
   }
 
@@ -1977,7 +1982,7 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
   static const unsigned Variants[] = {
     AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
-    AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+    AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
   };
 
   return makeArrayRef(Variants);
@@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
   return AMDGPU::NoRegister;
 }
 
-bool AMDGPUAsmParser::isSGPR(unsigned Reg) {
-  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
-  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
-  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
-  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
-         Reg == AMDGPU::SCC;
-}
-
 // NB: This code is correct only when used to check constant
 // bus limitations because GFX7 support no f16 inline constants.
 // Note that there are no cases when a GFX7 opcode violates
@@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
   if (MO.isImm()) {
     return !isInlineConstant(Inst, OpIdx);
   }
-  return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg()));
+  return !MO.isReg() ||
+         isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
 }
 
 bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
@@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
   if (Desc.TSFlags &
       (SIInstrFlags::VOPC |
        SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
-       SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) {
+       SIInstrFlags::VOP3 | SIInstrFlags::VOP3P |
+       SIInstrFlags::SDWA)) {
 
     // Check special imm operands (used by madmk, etc)
     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
@@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
   cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
 }
 
+void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+}
+
 void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
-  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI());
 }
 
 void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
-                              uint64_t BasicInstType) {
+                              uint64_t BasicInstType, bool skipVcc) {
   using namespace llvm::AMDGPU::SDWA;
 
   OptionalImmIndexMap OptionalIdx;
+  bool skippedVcc = false;
 
   unsigned I = 1;
   const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
   for (unsigned E = Operands.size(); I != E; ++I) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-    // Add the register arguments
-    if ((BasicInstType == SIInstrFlags::VOPC ||
-         BasicInstType == SIInstrFlags::VOP2)&&
-        Op.isReg() &&
-        Op.Reg.RegNo == AMDGPU::VCC) {
-      // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
-      // Skip it.
-      continue;
-    } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+    if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+      // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
+      // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
+      // Skip VCC only if we didn't skip it on previous iteration.
+      if (BasicInstType == SIInstrFlags::VOP2 &&
+          (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
+        skippedVcc = true;
+        continue;
+      } else if (BasicInstType == SIInstrFlags::VOPC &&
+                 Inst.getNumOperands() == 0) {
+        skippedVcc = true;
+        continue;
+      }
+    }
+    if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
       Op.addRegWithInputModsOperands(Inst, 2);
     } else if (Op.isImm()) {
       // Handle optional arguments
@@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
     } else {
       llvm_unreachable("Invalid operand type");
     }
+    skippedVcc = false;
   }
 
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-
-  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+      Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
     // V_NOP_sdwa_vi has no optional sdwa arguments
     switch (BasicInstType) {
    case SIInstrFlags::VOP1:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      if (isGFX9() &&
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
       break;
 
    case SIInstrFlags::VOP2:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      if (isGFX9() &&
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
@@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
       break;
 
    case SIInstrFlags::VOPC:
+      if (isVI()) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+      }
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
       break;
@@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
       Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi)  {
     auto it = Inst.begin();
     std::advance(
-        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+      it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
     Inst.insert(it, Inst.getOperand(0)); // src2 = dst
   }
-
 }
 
 /// Force static initialization.
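The disassembler and code-emitter hunks that follow implement the nine-bit GFX9 SDWA source field: the low eight bits select a VGPR, and setting bit 8 (SRC_SGPR_MASK = 0x100 in the SDWA9EncValues enum added to SIDefines.h) turns the field into an SGPR number, which is why decoded SGPR values occupy the 256..357 range. A hedged round-trip sketch of that scheme; encodeSrc and decodeIsSGPR are illustrative names, not LLVM APIs:

```cpp
#include <cassert>

// Constants mirror SDWA9EncValues from SIDefines.h in this patch.
constexpr unsigned SRC_VGPR_MASK = 0xFF;
constexpr unsigned SRC_SGPR_MASK = 0x100;
constexpr unsigned SRC_SGPR_MIN  = 256;

// Pack a register number the way getSDWA9SrcEncoding does: keep the low
// eight bits, then set the SGPR flag bit when the register is scalar.
unsigned encodeSrc(unsigned RegNum, bool IsSGPR) {
  unsigned Enc = RegNum & SRC_VGPR_MASK;
  return IsSGPR ? (Enc | SRC_SGPR_MASK) : Enc;
}

// decodeSDWA9Src classifies by range: values below 256 are VGPRs.
bool decodeIsSGPR(unsigned Val) { return Val >= SRC_SGPR_MIN; }

int main() {
  assert(encodeSrc(17, false) == 17);    // v17 encodes as 17
  assert(encodeSrc(17, true) == 0x111);  // s17 encodes as 256 + 17
  assert(decodeIsSGPR(encodeSrc(17, true)));
  assert(!decodeIsSGPR(encodeSrc(17, false)));
  return 0;
}
```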
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 137b5cca96ce..9b3cde7c4df6 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
   return addOperand(Inst, MCOperand::createImm(Imm));
 }
 
-#define DECODE_OPERAND2(RegClass, DecName) \
-static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
-                                                    unsigned Imm, \
-                                                    uint64_t /*Addr*/, \
-                                                    const void *Decoder) { \
+#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
+static DecodeStatus StaticDecoderName(MCInst &Inst, \
+                                       unsigned Imm, \
+                                       uint64_t /*Addr*/, \
+                                       const void *Decoder) { \
   auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
-  return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+  return addOperand(Inst, DAsm->DecoderName(Imm)); \
 }
 
-#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+#define DECODE_OPERAND_REG(RegClass) \
+DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
 
-DECODE_OPERAND(VGPR_32)
-DECODE_OPERAND(VS_32)
-DECODE_OPERAND(VS_64)
+DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VS_32)
+DECODE_OPERAND_REG(VS_64)
 
-DECODE_OPERAND(VReg_64)
-DECODE_OPERAND(VReg_96)
-DECODE_OPERAND(VReg_128)
+DECODE_OPERAND_REG(VReg_64)
+DECODE_OPERAND_REG(VReg_96)
+DECODE_OPERAND_REG(VReg_128)
 
-DECODE_OPERAND(SReg_32)
-DECODE_OPERAND(SReg_32_XM0_XEXEC)
-DECODE_OPERAND(SReg_64)
-DECODE_OPERAND(SReg_64_XEXEC)
-DECODE_OPERAND(SReg_128)
-DECODE_OPERAND(SReg_256)
-DECODE_OPERAND(SReg_512)
+DECODE_OPERAND_REG(SReg_32)
+DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
+DECODE_OPERAND_REG(SReg_64)
+DECODE_OPERAND_REG(SReg_64_XEXEC)
+DECODE_OPERAND_REG(SReg_128)
+DECODE_OPERAND_REG(SReg_256)
+DECODE_OPERAND_REG(SReg_512)
 
 static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
@@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
 }
 
+#define DECODE_SDWA9(DecName) \
+DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName)
+
+DECODE_SDWA9(Src32)
+DECODE_SDWA9(Src16)
+DECODE_SDWA9(VopcDst)
+
 #include "AMDGPUGenDisassemblerTables.inc"
 
 //===----------------------------------------------------------------------===//
@@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
       if (Res) break;
+
+      Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
+      if (Res) break;
     }
 
     // Reinitialize Bytes as DPP64 could have eaten too much
@@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
   return errOperand(Val, "unknown operand encoding " + Twine(Val));
 }
 
+MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width,
+                                             unsigned Val) const {
+  using namespace AMDGPU::SDWA;
+
+  if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+      Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+    return createRegOperand(getVgprClassId(Width),
+                            Val - SDWA9EncValues::SRC_VGPR_MIN);
+  }
+
+  if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+      Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+    return createSRegOperand(getSgprClassId(Width),
+                             Val - SDWA9EncValues::SRC_SGPR_MIN);
+  }
+
+  return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const {
+  return decodeSDWA9Src(OPW16, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const {
+  return decodeSDWA9Src(OPW32, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const {
+  using namespace AMDGPU::SDWA;
+
+  if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
+    Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+    if (Val > AMDGPU::EncValues::SGPR_MAX) {
+      return decodeSpecialReg64(Val);
+    } else {
+      return createSRegOperand(getSgprClassId(OPW64), Val);
+    }
+  } else {
+    return createRegOperand(AMDGPU::VCC);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUSymbolizer
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 620bae0a6d1a..0ff405a71e9b 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -104,6 +104,11 @@ public:
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
+
+  MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const;
+  MCOperand decodeSDWA9Src16(unsigned Val) const;
+  MCOperand decodeSDWA9Src32(unsigned Val) const;
+  MCOperand decodeSDWA9VopcDst(unsigned Val) const;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3bb5c9bc22b7..8ead48067336 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -191,6 +191,7 @@ public:
   }
 };
 
+namespace {
 // just a stub to make base class happy
 class SchedStrategyStub : public MachineSchedStrategy {
 public:
@@ -202,6 +203,7 @@ public:
   void releaseTopNode(SUnit *SU) override {}
   void releaseBottomNode(SUnit *SU) override {}
 };
+} // namespace
 
 GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
                                              StrategyKind S)
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index c6d0f2179950..d378df674be9 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -17,6 +17,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "misched"
 
+namespace {
 class GCNMinRegScheduler {
   struct Candidate : ilist_node<Candidate> {
     const SUnit *SU;
@@ -71,6 +72,7 @@ public:
   std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
                                      const ScheduleDAG &DAG);
 };
+} // namespace
 
 void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
   NumPreds.resize(SUnits.size());
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 18374dca3f84..390a8286c76a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
   return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
 }
 
-SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
-                                              const LiveIntervals &LIS,
-                                              const MachineRegisterInfo &MRI) {
+static SmallVector<RegisterMaskPair, 8>
+collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
+                      const MachineRegisterInfo &MRI) {
   SmallVector<RegisterMaskPair, 8> Res;
   for (const auto &MO : MI.operands()) {
     if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 3d3858ab47ec..a856b17a228f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,6 +52,18 @@ public:
     return 0;
   }
 
+  virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
+  virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
 protected:
   uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
   void verifyInstructionPredicates(const MCInst &MI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index bda0928036fd..e02acf516c0d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -69,6 +69,14 @@ public:
   unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const override;
+
+  unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const override;
+
+  unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const override;
 };
 
 } // end anonymous namespace
@@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
   return getMachineOpValue(MI, MO, Fixups, STI);
 }
 
+unsigned
+SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  using namespace AMDGPU::SDWA;
+
+  uint64_t RegEnc = 0;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  unsigned Reg = MO.getReg();
+  RegEnc |= MRI.getEncodingValue(Reg);
+  RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+  if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+    RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+  }
+  return RegEnc;
+}
+
+unsigned
+SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  using namespace AMDGPU::SDWA;
+
+  uint64_t RegEnc = 0;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  unsigned Reg = MO.getReg();
+  if (Reg != AMDGPU::VCC) {
+    RegEnc |= MRI.getEncodingValue(Reg);
+    RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+    RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
+  }
+  return RegEnc;
+}
+
 uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                             const MCOperand &MO,
                                        SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 3590a9b05e1d..60b913cfd39a 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
    return VT.changeVectorElementTypeToInteger();
 }
 
+bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+  // Local and Private addresses do not handle vectors. Limit to i32
+  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+    return (MemVT.getSizeInBits() <= 32);
+  }
+  return true;
+}
+
 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                         unsigned AddrSpace,
                                                         unsigned Align,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 9700ce14c6f3..d6a0876a6ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -44,6 +44,8 @@ public:
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
                          EVT VT) const override;
 
+  bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index cc667d985a82..3c1e8527284c 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
     R600_Addr,
     R600_KC0, R600_KC1,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
-    ALU_CONST, ALU_PARAM, OQAP
+    ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR
     )>;
 
 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a01330cb9171..80967edee0ab 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -118,6 +118,10 @@ namespace AMDGPU {
     // Operand for source modifiers for VOP instructions
     OPERAND_INPUT_MODS,
 
+    // Operand for GFX9 SDWA instructions
+    OPERAND_SDWA9_SRC,
+    OPERAND_SDWA9_VOPC_DST,
+
    /// Operand with 32-bit immediate that uses the constant bus.
     OPERAND_KIMM32,
     OPERAND_KIMM16
@@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants {
     DEFAULT = 0,
     VOP3 = 1,
     SDWA = 2,
-    DPP = 3
+    SDWA9 = 3,
+    DPP = 4
   };
 }
 
@@ -294,6 +299,18 @@ enum DstUnused {
   UNUSED_PRESERVE = 2,
 };
 
+enum SDWA9EncValues{
+  SRC_SGPR_MASK = 0x100,
+  SRC_VGPR_MASK = 0xFF,
+  VOPC_DST_VCC_MASK = 0x80,
+  VOPC_DST_SGPR_MASK = 0x7F,
+
+  SRC_VGPR_MIN = 0,
+  SRC_VGPR_MAX = 255,
+  SRC_SGPR_MIN = 256,
+  SRC_SGPR_MAX = 357,
+};
+
 } // namespace SDWA
 } // namespace AMDGPU
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca4..76c2644867aa 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
   }
 }
 
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+    return (MemVT.getSizeInBits() <= 4 * 32);
+  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
+    return (MemVT.getSizeInBits() <= MaxPrivateBits);
+  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+    return (MemVT.getSizeInBits() <= 2 * 32);
+  }
+  return true;
+}
+
 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
@@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
   SDValue RHS = N->getOperand(1);
 
-  if (VT == MVT::i64) {
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-    if (CRHS) {
-      if (SDValue Split
-          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
-        return Split;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (VT == MVT::i64 && CRHS) {
+    if (SDValue Split
+        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+      return Split;
+  }
+
+  if (CRHS && VT == MVT::i32) {
+    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+    // nb = number of trailing zeroes in mask
+    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+    uint64_t Mask = CRHS->getZExtValue();
+    unsigned Bits = countPopulation(Mask);
+    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+        unsigned Shift = CShift->getZExtValue();
+        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+        unsigned Offset = NB + Shift;
+        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+          SDLoc SL(N);
+          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                    LHS->getOperand(0),
+                                    DAG.getConstant(Offset, SL, MVT::i32),
+                                    DAG.getConstant(Bits, SL, MVT::i32));
+          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+                                    DAG.getValueType(NarrowVT));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+          return Shl;
+        }
+      }
     }
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index e68837747491..8e2ec40b224c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -150,6 +150,8 @@ public:
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
 
+  bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38a16b525a75..36d29b8ecf06 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI.getOpcode();
+
+  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
+    return true;
+
   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 7b052844f177..c5287c7f64ba 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
   let ParserMatchClass = VReg32OrOffClass;
 }
 
+class SDWA9Src : RegisterOperand<VS_32> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_SDWA9_SRC";
+  let EncoderMethod = "getSDWA9SrcEncoding";
+}
+
+def SDWA9Src32 : SDWA9Src {
+  let DecoderMethod = "decodeSDWA9Src32";
+}
+
+def SDWA9Src16 : SDWA9Src {
+  let DecoderMethod = "decodeSDWA9Src16";
+}
+
+def SDWA9VopcDst : VOPDstOperand<SReg_64> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_SDWA9_VOPC_DST";
+  let EncoderMethod = "getSDWA9VopcDstEncoding";
+  let DecoderMethod = "decodeSDWA9VopcDst";
+}
+
 class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
   let Name = "Imm"#CName;
   let PredicateMethod = "is"#CName;
@@ -588,6 +609,16 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
 def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
 def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
 
+def FPRegInputModsMatchClass : AsmOperandClass {
+  let Name = "RegWithFPInputMods";
+  let ParserMethod = "parseRegWithFPInputMods";
+  let PredicateMethod = "isRegKind";
+}
+
+def FPRegInputMods : InputMods <FPRegInputModsMatchClass> {
+  let PrintMethod = "printOperandAndFPInputMods";
+}
+
 def FPVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithFPInputMods";
   let ParserMethod = "parseRegWithFPInputMods";
@@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
   let PrintMethod = "printOperandAndFPInputMods";
 }
 
+
+def IntRegInputModsMatchClass : AsmOperandClass {
+  let Name = "RegWithIntInputMods";
+  let ParserMethod = "parseRegWithIntInputMods";
+  let PredicateMethod = "isRegKind";
+}
+
+def IntRegInputMods : InputMods <IntRegInputModsMatchClass> {
+  let PrintMethod = "printOperandAndIntInputMods";
+}
+
 def IntVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithIntInputMods";
   let ParserMethod = "parseRegWithIntInputMods";
@@ -783,6 +825,14 @@ class getVALUDstForVT<ValueType VT> {
                               VOPDstOperand<SReg_64>)))); // else VT == i1
 }
 
+// Returns the register class to use for the destination of VOP[12C]
+// instructions with GFX9 SDWA extension
+class getSDWA9DstForVT<ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 1),
+                            SDWA9VopcDst, // VOPC
+                            VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
+}
+
 // Returns the register class to use for source 0 of VOP[12C]
 // instructions for the given VT.
 class getVOPSrc0ForVT<ValueType VT> {
@@ -823,6 +873,9 @@ class getVregSrcForVT<ValueType VT> {
                         !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
 }
 
+class getSDWA9SrcForVT <ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32);
+}
 
 // Returns the register class to use for sources of VOP3 instructions for the
 // given VT.
@@ -926,6 +979,15 @@ class getSrcModExt <ValueType VT> {
   Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
 }
 
+// Return type of input modifiers operand specified input operand for SDWA 9
+class getSrcModSDWA9 <ValueType VT> {
+    bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+               !if(!eq(VT.Value, f32.Value), 1,
+               !if(!eq(VT.Value, f64.Value), 1,
+               0)));
+  Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods);
+}
+
 // Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {    dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,                 // VOP1 without input operands (V_NOP)                 (ins),              !if(!eq(NumSrcArgs, 1), +               // VOP1_SDWA                 (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,                      src0_sel:$src0_sel), @@ -1071,7 +1134,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,                    (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                         Src1Mod:$src1_modifiers, Src1RC:$src1,                         clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), -                  // VOP2_SDWA or VOPC_SDWA with modifiers +                  // VOP2_SDWA with modifiers                    (ins Src0Mod:$src0_modifiers, Src0RC:$src0,                         Src1Mod:$src1_modifiers, Src1RC:$src1,                         clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -1079,12 +1142,65 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,              (ins)/* endif */)));  } +// Ins for GFX9 SDWA +class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, +                   bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, +                   ValueType DstVT> { + +  dag ret = !if(!eq(NumSrcArgs, 0), +               // VOP1 without input operands (V_NOP) +               (ins), +            !if(!eq(NumSrcArgs, 1), +               // VOP1 +               !if(!eq(HasSDWAOMod, 0), +                  // VOP1_SDWA9 without omod +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       clampmod:$clamp, +                       dst_sel:$dst_sel, dst_unused:$dst_unused, +                       src0_sel:$src0_sel), +                  // VOP1_SDWA9 with omod +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       clampmod:$clamp, omod:$omod, +                       dst_sel:$dst_sel, dst_unused:$dst_unused, +                       src0_sel:$src0_sel)), +            !if(!eq(NumSrcArgs, 2), +               !if(!eq(DstVT.Size, 1), +                  // VOPC_SDWA9 +                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                       Src1Mod:$src1_modifiers, Src1RC:$src1, +                       src0_sel:$src0_sel, src1_sel:$src1_sel), +                  // VOP2_SDWA9 +                  !if(!eq(HasSDWAOMod, 0), +                     // VOP2_SDWA9 without omod +                     (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                          Src1Mod:$src1_modifiers, Src1RC:$src1, +                          clampmod:$clamp, +                          dst_sel:$dst_sel, dst_unused:$dst_unused, +                          src0_sel:$src0_sel, src1_sel:$src1_sel), +                     // VOP1_SDWA9 with omod +                     (ins Src0Mod:$src0_modifiers, Src0RC:$src0, +                          Src1Mod:$src1_modifiers, Src1RC:$src1, +                          clampmod:$clamp, omod:$omod, +                          dst_sel:$dst_sel, dst_unused:$dst_unused, +                          src0_sel:$src0_sel, src1_sel:$src1_sel))), +            (ins)/* endif */))); +} +  // Outs for DPP and SDWA -class getOutsExt <bit HasDst, ValueType DstVT, 
RegisterOperand DstRCDPP> { +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {    dag ret = !if(HasDst,                  !if(!eq(DstVT.Size, 1),                      (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions -                    (outs DstRCDPP:$vdst)), +                    (outs DstRCExt:$vdst)), +                (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { +  dag ret = !if(HasDst, +                !if(!eq(DstVT.Size, 1), +                    (outs DstRCSDWA9:$sdst), +                    (outs DstRCSDWA9:$vdst)),                  (outs)); // V_NOP  } @@ -1153,8 +1269,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =    string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";  } -class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, -                  ValueType DstVT = i32> { +class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {    string dst = !if(HasDst,                     !if(!eq(DstVT.Size, 1),                         " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,    string ret = dst#args#sdwa;  } +class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, +                   ValueType DstVT = i32> { +  string dst = !if(HasDst, +                   !if(!eq(DstVT.Size, 1), +                       "$sdst", // VOPC +                       "$vdst"), // VOP1/2 +                    ""); +  string src0 = "$src0_modifiers"; +  string src1 = "$src1_modifiers"; +  string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); +  string args = !if(!eq(NumSrcArgs, 0), "", +                    !if(!eq(NumSrcArgs, 1), +                        ", "#src0, +                        ", "#src0#", "#src1 +                     ) +                ); +  string sdwa = !if(!eq(NumSrcArgs, 0), "", +                    !if(!eq(NumSrcArgs, 1), +                        out_mods#" $dst_sel $dst_unused $src0_sel", +                        !if(!eq(DstVT.Size, 1), +                            " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC +                            out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" +                        ) +                    ) +                ); +  string ret = dst#args#sdwa; +} + +  // Function that checks if instruction supports DPP and SDWA  class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,                   ValueType Src1VT = i32> { @@ -1219,6 +1363,7 @@ class VOPProfile <list<ValueType> _ArgVT> {    field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;    field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;    field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; +  field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret;    field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;    field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;    field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1228,6 +1373,8 @@ class VOPProfile <list<ValueType> _ArgVT> {    field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;    field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;    field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; +  field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; +  field RegisterOperand Src1SDWA9 = 
getSDWA9SrcForVT<Src0VT>.ret;    field Operand Src0Mod = getSrcMod<Src0VT>.ret;    field Operand Src1Mod = getSrcMod<Src1VT>.ret;    field Operand Src2Mod = getSrcMod<Src2VT>.ret; @@ -1235,6 +1382,8 @@ class VOPProfile <list<ValueType> _ArgVT> {    field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;    field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;    field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; +  field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; +  field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret;    field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile <list<ValueType> _ArgVT> {    field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);    field bit HasClamp = HasModifiers; -  field bit HasSDWAClamp = HasSrc0; +  field bit HasSDWAClamp = EmitDst;    field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;    field bit IsPacked = isPackedType<Src0VT>.ret;    field bit HasOpSel = IsPacked;    field bit HasOMod = !if(HasOpSel, 0, HasModifiers); +  field bit HasSDWAOMod = isFloatType<DstVT>.ret;    field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; +  field bit HasSDWA9 = HasExt;    field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);    field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile <list<ValueType> _ArgVT> {    field dag Outs64 = Outs;    field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;    field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; +  field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret;    field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;    field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1296,16 +1448,21 @@ class VOPProfile <list<ValueType> _ArgVT> {    field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,                                   HasModifiers, Src0ModSDWA, Src1ModSDWA,                                   DstVT>.ret; +  field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, +                                   HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, +                                   DstVT>.ret;    field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;    field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;    field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;    field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; -  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; +  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; +  field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;  }  class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {    let HasExt = 0; +  let HasSDWA9 = 0;  }  def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping {    let ValueCols = [["SDWA"]];  } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { +  let FilterClass = "VOP"; +  let RowFields = ["OpName"]; +  let ColFields = ["AsmVariantName"]; +  let KeyCol = ["Default"]; +  let ValueCols = [["SDWA9"]]; +} +  def getMaskedMIMGOp : InstrMapping {    let FilterClass = "MIMG_Mask";    let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td 
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index f2d8b6f7b7a4..ec29a66c8bbb 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32    <"s_bitset0_b32">;
 def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
 def S_BITSET1_B32 : SOP1_32    <"s_bitset1_b32">;
 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
-def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64">;
+def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64",
+  [(set i64:$sdst, (int_amdgcn_s_getpc))]
+>;
 
 let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
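Aside (not part of the commit): the new selection pattern ties the llvm.amdgcn.s.getpc intrinsic directly to s_getpc_b64. A minimal sketch of emitting that intrinsic with IRBuilder; the helper name is illustrative, the module/builder setup is assumed context:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emits `%pc = call i64 @llvm.amdgcn.s.getpc()`; with the pattern above,
// instruction selection can now lower this call to s_getpc_b64.
static Value *emitGetPC(Module &M, IRBuilder<> &B) {
  Function *GetPC = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_s_getpc);
  return B.CreateCall(GetPC);
}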
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2abd4afad3b6..630f469eabf0 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
 }
 
+bool isGFX9(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
+  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+    Reg == AMDGPU::SCC;
+}
+
 unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
 
   switch(Reg) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8e74aa2cc9a8..19888ad7556a 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) {
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
 bool isVI(const MCSubtargetInfo &STI);
+bool isGFX9(const MCSubtargetInfo &STI);
+
+/// \brief Is \p Reg a scalar register?
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
 
 /// If \p Reg is a pseudo reg, return the correct hardware register given
 /// \p STI otherwise return \p Reg.
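Aside (not part of the commit): a comment later in this same patch notes that at most one GFX9 SDWA source may be an SGPR, which is the kind of constraint isSGPR enables checking at the MC layer. A hypothetical validator, invented here for illustration:

#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCRegisterInfo.h"

using namespace llvm;

// Hypothetical operand check: GFX9 SDWA permits an SGPR in place of a
// VGPR for src0 or src1, but not for both at once.
static bool isLegalSDWA9SrcPair(unsigned Src0, unsigned Src1,
                                const MCRegisterInfo *TRI) {
  return !(AMDGPU::isSGPR(Src0, TRI) && AMDGPU::isSGPR(Src1, TRI));
}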
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 1febc6bf8ec2..95b5ef0a49db 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31-25} = 0x3f; // encoding
 }
 
+class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; // encoding
+}
+
 class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
   let Src0RC64 = VRegSrc_32;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
 }
 
 // Special case because there are no true output operands.  Hack vdst
@@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
   let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
   let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
-  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
+  let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel);
+  let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel);
 
   let Asm32 = getAsm32<1, 1>.ret;
   let Asm64 = getAsm64<1, 1, 0, 1>.ret;
   let AsmDPP = getAsmDPP<1, 1, 0>.ret;
-  let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+  let AsmSDWA = getAsmSDWA<1, 1>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
 
   let HasExt = 0;
+  let HasSDWA9 = 0;
   let HasDst = 0;
   let EmitDst = 1; // force vdst emission
 }
@@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
@@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 
 }
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 def : Pat<
     (f32 (f16_to_fp i16:$src)),
@@ -523,6 +544,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
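Aside (not part of the commit): a C++ sketch of the low dword that VOP1_SDWA9Ae describes, to make the field layout concrete; the packing helper is invented:

#include <cstdint>

// Packs the low dword of a GFX9 VOP1 SDWA instruction per VOP1_SDWA9Ae:
// 0xf9 in [8:0] flags the SDWA word, the VOP1 opcode sits in [16:9],
// vdst in [24:17], and [31:25] holds the 0x3f VOP1 encoding.
static uint32_t packVOP1SDWA9LowDword(uint8_t Op, uint8_t VDst) {
  return 0xf9u | (uint32_t(Op) << 9) | (uint32_t(VDst) << 17) | (0x3fu << 25);
}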
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 4a11d9471f1d..657cacaa792c 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -48,6 +48,18 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31}    = 0x0; // encoding
 }
 
+class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+  bits<8> vdst;
+  bits<9> src1;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; // encoding
+  let Inst{63}    = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
 class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
@@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set P.DstVT:$vdst,
@@ -121,10 +138,10 @@ multiclass VOP2Inst <string opName,
   def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
              Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-  def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa  : VOP2_SDWA_Pseudo <opName, P>;
+  def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
 }
 
-// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
                       SDPatternOperator node = null_frag,
@@ -136,7 +153,13 @@ multiclass VOP2bInst <string opName,
       def _e32 : VOP2_Pseudo <opName, P>,
                  Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
-      def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+      def _sdwa  : VOP2_SDWA_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
+
+      def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
+        let AsmMatchConverter = "cvtSdwaVOP2b";
+      }
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -203,13 +226,21 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
                     VGPR_32:$src2, // stub argument
                     clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      VGPR_32:$src2, // stub argument
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let Asm32 = getAsm32<1, 2, vt>.ret;
   let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
   let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
-  let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+  let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
+  let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
   let HasSrc2 = 0;
   let HasSrc2Mods = 0;
   let HasExt = 1;
+  let HasSDWA9 = 0;
 }
 
 def VOP_MAC_F16 : VOP_MAC <f16> {
@@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
   let Asm32 = "$vdst, vcc, $src0, $src1";
   let Asm64 = "$vdst, $sdst, $src0, $src1";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
   let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
   let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
   let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
   let Outs32 = (outs DstRC:$vdst);
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
   // implicit VCC use.
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
 
-  let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
-                     Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+  let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+                     Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      clampmod:$clamp, omod:$omod,
+                      dst_sel:$dst_sel, dst_unused:$dst_unused,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
+
   let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
                     Src1Mod:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
   let HasExt = 1;
+  let HasSDWA9 = 1;
 }
 
 // Read in from vcc or arbitrary SGPR
@@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
 
 } // End let SubtargetPredicate = SICI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
@@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
 }
 } // End isCommutable = 1
-} // End SubtargetPredicate = isVI
+} // End SubtargetPredicate = Has16BitInsts
 
 // Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat <
   (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
 >;
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
 defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
@@ -513,7 +553,7 @@ def : Pat<
   (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
 >;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 //===----------------------------------------------------------------------===//
 // SI
@@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
     VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 }
 
+multiclass VOP2_SDWA9_Real <bits<6> op> {
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+}
+
 multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
 }
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
-  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
   // For now left dpp only for asm/dasm
   // TODO: add corresponding pseudo
   def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c0b5069948fb..001fc960b228 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 
 } // End SubtargetPredicate = isCIVI
 
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
 
 let isCommutable = 1 in {
 
@@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 
 }  // End isCommutable = 1
+} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = isVI in {
 def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
 } // End SubtargetPredicate = isVI
 
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
 
 multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
                              Instruction inst, SDPatternOperator op3> {
@@ -288,7 +289,7 @@ def : Pat<
 defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
 defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
 
 let SubtargetPredicate = isGFX9 in {
 def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index a3550a63677b..cd347b86d305 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{44-43} = SDWA.UNUSED_PRESERVE;
 }
 
+class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
+  bits<9> src1;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e; // encoding
+  let Inst{63}    = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
+
 //===----------------------------------------------------------------------===//
 // VOPC classes
 //===----------------------------------------------------------------------===//
@@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOPC";
 }
 
+class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA9_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
 // This class is used only with VOPC instructions. Use $sdst for out operand
 class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
   InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -173,6 +189,13 @@ multiclass VOPC_Pseudos <string opName,
     let isConvergent = DefExec;
     let isCompare = 1;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = P.Schedule;
+    let isConvergent = DefExec;
+    let isCompare = 1;
+  }
 }
 
 def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -520,7 +543,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
   let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
                      Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
                      clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+                      Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+  //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
   let HasSrc1Mods = 0;
   let HasClamp = 0;
   let HasOMod = 0;
@@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
     let SchedRW = p.Schedule;
     let isConvergent = DefExec;
   }
+
+  def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = p.Schedule;
+    let isConvergent = DefExec;
+  }
 }
 
 def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -920,6 +953,10 @@ multiclass VOPC_Real_vi <bits<10> op> {
     VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+    VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
   def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
                        !cast<Instruction>(NAME#"_e32_vi")> {
     let AssemblerPredicate = isVI;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 69906c419db3..4da654f84f9d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -293,11 +293,52 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
   let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
   let Inst{45}    = !if(P.HasSDWAClamp, clamp{0}, 0);
   let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
-  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{51}    = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{59}    = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+}
+
+// gfx9 SDWA basic encoding
+class VOP_SDWA9e<VOPProfile P> : Enc64 {
+  bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
+  bits<3> src0_sel;
+  bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+  bits<3> src1_sel;
+  bits<2> src1_modifiers;
+  bits<1> src1_sgpr;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{51}    = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+  let Inst{55}    = !if(P.HasSrc0, src0{8}, 0);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
   let Inst{59}    = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+  let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+  let Inst{63}    = 0; // src1_sgpr - should be specified in subclass
+}
+
+// gfx9 SDWA-A
+class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<3> dst_sel;
+  bits<2> dst_unused;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{45}    = !if(P.HasSDWAClamp, clamp{0}, 0);
+  let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
+}
+
+// gfx9 SDWA-B
+class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
+  bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
+
+  let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+  let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
 }
 
 class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
@@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   VOPProfile Pfl = P;
 }
 
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+//    a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+//       than VGPRs (at most 1 can be an SGPR);
+//    b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+//    replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+//    field.
+//    a. When SD=1, the SDST is used as the destination for the compare result;
+//    b. When SD=0, VCC is used.
+//
+// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
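Aside (not part of the commit): the GFX9-only fields enumerated above pack as follows; a hedged C++ sketch using the bit positions from VOP_SDWA9e and VOP_SDWA9Ae (the helper itself is invented):

#include <cstdint>

// src0 is 9 bits wide, {src0_sgpr, src0[7:0]}; omod is the 2-bit output
// modifier (*2, *4, /2) written to [47:46] in the SDWA-A form.
static uint64_t packSDWA9Src0AndOMod(uint16_t Src0 /*9 bits*/, uint8_t OMod) {
  uint64_t Inst = 0;
  Inst |= uint64_t(Src0 & 0xff) << 32;     // [39:32] src0{7-0}
  Inst |= uint64_t((Src0 >> 8) & 1) << 55; // [55]    S0: src0 is an SGPR
  Inst |= uint64_t(OMod & 0x3) << 46;      // [47:46] OMOD
  return Inst;
}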
+
+class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
+  MnemonicAlias <opName#"_sdwa9", opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.AsmSDWA9;
+
+  let Size = 8;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+
+  let VALU = 1;
+  let SDWA = 1;
+  let Uses = [EXEC];
+
+  let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+  let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+                                     AMDGPUAsmVariants.Disable);
+  let DecoderNamespace = "SDWA9";
+
+  VOPProfile Pfl = P;
+}
+
 class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
@@ -358,6 +443,33 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
   let TSFlags              = ps.TSFlags;
 }
 
+class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
+}
+
 class VOP_DPPe<VOPProfile P> : Enc64 {
   bits<2> src0_modifiers;
   bits<8> src0;
