Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 47
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 23
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 25
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 25
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 22
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 151
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td | 411
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 100
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 10
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 70
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 34
-rw-r--r--  lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 8
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 48
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 12
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 64
-rw-r--r--  lib/Target/AMDGPU/Processors.td | 4
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 24
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 6
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 30
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 83
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 4
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 261
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 5
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 71
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 195
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 11
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 14
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 100
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 40
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 7
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/VOP1Instructions.td | 16
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 68
-rw-r--r--  lib/Target/AMDGPU/VOPCInstructions.td | 28
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td | 88
45 files changed, 1606 insertions, 590 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 6ab2b9ef04598..7494e5decd6f6 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa",
"Support SDWA (Sub-DWORD Addressing) extension"
>;
+def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
+ "HasSDWAOmod",
+ "true",
+ "Support OMod with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
+ "HasSDWAScalar",
+ "true",
+ "Support scalar register with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
+ "HasSDWASdst",
+ "true",
+ "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAMac : SubtargetFeature<"sdwa-mav",
+ "HasSDWAMac",
+ "true",
+ "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc",
+ "HasSDWAClampVOPC",
+ "true",
+ "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
def FeatureDPP : SubtargetFeature<"dpp",
"HasDPP",
"true",
@@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
- FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA,
- FeatureDPP
+ FeatureScalarStores, FeatureInv2PiInlineImm,
+ FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP
]
>;
@@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
@@ -449,14 +480,14 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
[FeatureSouthernIslands,
- FeatureFastFMAF32,
+ FeatureFastFMAF32,
HalfRate64Ops,
FeatureLDSBankCount32]>;
def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
[FeatureSouthernIslands,
FeatureLDSBankCount32]>;
-
+
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
@@ -644,7 +675,11 @@ def isCIVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<"FeatureCIInsts">;
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
+ AssemblerPredicate<"FeatureFlatAddressSpace">;
+
+def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
+ AssemblerPredicate<"FeatureFlatGlobalInsts">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
AssemblerPredicate<"Feature16BitInsts">;
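Editor's note: the single FeatureSDWA bit is split here into finer-grained capabilities so VI and GFX9 can advertise exactly which SDWA forms they encode (omod, scalar sources, scalar VOPC dst, v_mac, clamp on VOPC). A minimal sketch of how these bits are meant to be consulted, using the AMDGPUSubtarget accessors added later in this patch (the helper functions themselves are illustrative, not part of the change):

// Illustrative helpers; hasSDWA()/hasSDWAOmod()/hasSDWAScalar() are the
// accessors this patch adds to AMDGPUSubtarget.
static bool canUseOModInSDWA(const AMDGPUSubtarget &ST) {
  return ST.hasSDWA() && ST.hasSDWAOmod();   // GFX9 only; VI leaves omod at 0
}
static bool canUseSGPRSrcInSDWA(const AMDGPUSubtarget &ST) {
  return ST.hasSDWA() && ST.hasSDWAScalar(); // scalar SDWA sources are GFX9 only
}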
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5586b513b5fca..96f819fd0e684 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3527,18 +3527,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
//===----------------------------------------------------------------------===//
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VirtualRegister;
+ unsigned VReg;
+
if (!MRI.isLiveIn(Reg)) {
- VirtualRegister = MRI.createVirtualRegister(RC);
- MRI.addLiveIn(Reg, VirtualRegister);
+ VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(Reg, VReg);
} else {
- VirtualRegister = MRI.getLiveInVirtReg(Reg);
+ VReg = MRI.getLiveInVirtReg(Reg);
}
- return DAG.getRegister(VirtualRegister, VT);
+
+ if (RawReg)
+ return DAG.getRegister(VReg, VT);
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
@@ -3657,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0d066cdbdff4d..a45234e2b39f2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -216,10 +216,25 @@ public:
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
- /// \returns a RegisterSDNode representing Reg.
- virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
+ /// a copy from the register.
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg = false) const;
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
+ }
+
+ // Returns the raw live in register rather than a copy from it.
+ SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
+ }
enum ImplicitParameter {
FIRST_IMPLICIT,
@@ -388,6 +403,8 @@ enum NodeType : unsigned {
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
+ TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_LOAD_FORMAT,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
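Editor's note: the new overload pair makes the common case (a value copied out of the live-in at function entry) the default, while CreateLiveInRegisterRaw preserves the old behaviour of returning the bare RegisterSDNode. A minimal sketch, assuming this runs inside an AMDGPUTargetLowering method with a SelectionDAG DAG in scope; the SGPR0 / SGPR_32RegClass choice is purely for illustration:

// Illustrative only; register and class are placeholder choices.
SDValue Copy = CreateLiveInRegister(DAG, &AMDGPU::SGPR_32RegClass,
                                    AMDGPU::SGPR0, MVT::i32);
// Copy is a CopyFromReg anchored at the entry node: usable directly as a value.

SDValue Raw = CreateLiveInRegisterRaw(DAG, &AMDGPU::SGPR_32RegClass,
                                      AMDGPU::SGPR0, MVT::i32);
// Raw is the RegisterSDNode itself, i.e. what the old single-variant API returned.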
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index a01f5d37c7c16..69dc529861729 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
- VI = 1
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3
};
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
@@ -101,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
}
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e286558ce60d7..bcf89bb78ad66 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -70,6 +70,10 @@ def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
>;
+def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -179,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
+// nodes in TargetSelectionDAG.td.
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;
+
+def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;
+
def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 790a69b843979..cc56216c355bf 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -29,12 +29,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
using namespace TargetOpcode;
const LLT S1= LLT::scalar(1);
+ const LLT V2S16 = LLT::vector(2, 16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
setAction({G_ADD, S32}, Legal);
+ setAction({G_AND, S32}, Legal);
+
+ setAction({G_BITCAST, V2S16}, Legal);
+ setAction({G_BITCAST, 1, S32}, Legal);
+
+ setAction({G_BITCAST, S32}, Legal);
+ setAction({G_BITCAST, 1, V2S16}, Legal);
// FIXME: i1 operands to intrinsics should always be legal, but other i1
// values may not be legal. We need to figure out how to distinguish
@@ -61,6 +69,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
+ setAction({G_SHL, S32}, Legal);
+
setAction({G_STORE, S32}, Legal);
setAction({G_STORE, 1, P1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 8d157e2f98f24..ab5abf2039a5b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasScalarStores(false),
HasInv2PiInlineImm(false),
HasSDWA(false),
+ HasSDWAOmod(false),
+ HasSDWAScalar(false),
+ HasSDWASdst(false),
+ HasSDWAMac(false),
+ HasSDWAClampVOPC(false),
HasDPP(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 5f4f20316a6ba..2b16289c723ef 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -149,6 +149,11 @@ protected:
bool HasScalarStores;
bool HasInv2PiInlineImm;
bool HasSDWA;
+ bool HasSDWAOmod;
+ bool HasSDWAScalar;
+ bool HasSDWASdst;
+ bool HasSDWAMac;
+ bool HasSDWAClampVOPC;
bool HasDPP;
bool FlatAddressSpace;
bool FlatInstOffsets;
@@ -431,6 +436,26 @@ public:
return HasSDWA;
}
+ bool hasSDWAOmod() const {
+ return HasSDWAOmod;
+ }
+
+ bool hasSDWAScalar() const {
+ return HasSDWAScalar;
+ }
+
+ bool hasSDWASdst() const {
+ return HasSDWASdst;
+ }
+
+ bool hasSDWAMac() const {
+ return HasSDWAMac;
+ }
+
+ bool hasSDWAClampVOPC() const {
+ return HasSDWAClampVOPC;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b644eba536fa4..04fe9f689806c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -342,6 +342,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(createAMDGPUExternalAAWrapperPass());
}
});
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_CGSCCOptimizerLate,
+ [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ PM.add(createInferAddressSpacesPass());
+ });
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0d6689bd04c4e..88245b01683a5 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
}
}
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
- if (Vec)
- return 0;
+unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ // The concept of vector registers doesn't really exist. Some packed vector
+ // operations operate on the normal 32-bit registers.
// Number of VGPRs on SI.
if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+ // This is really the number of registers to fill when vectorizing /
+ // interleaving loops, so we lie to avoid trying to use all registers.
+ return getHardwareNumberOfRegisters(Vec) >> 3;
+}
+
unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
- return Vector ? 0 : 32;
+ return 32;
+}
+
+unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
}
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
@@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
if (VF == 1)
return 1;
- // Semi-arbitrary large amount.
- return 64;
+ return 8;
}
int AMDGPUTTIImpl::getArithmeticInstrCost(
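Editor's note: both register hooks now return a non-zero answer for vector queries, and the two numbers differ on purpose. A worked instance of the heuristic (256 is the SI+ VGPR count returned by getHardwareNumberOfRegisters; the assertion is only illustrative):

// getHardwareNumberOfRegisters() -> 256 VGPRs on SI and later;
// getNumberOfRegisters() under-reports by 8x so the loop vectorizer /
// interleaver does not try to keep every hardware register live.
static_assert((256 >> 3) == 32, "vectorizer is told about 32 registers, not 256");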
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index a60b1bb1b59c7..485e20411ab49 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -75,8 +75,10 @@ public:
return TTI::PSK_FastHardware;
}
- unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getHardwareNumberOfRegisters(bool Vector) const;
+ unsigned getNumberOfRegisters(bool Vector) const;
+ unsigned getRegisterBitWidth(bool Vector) const ;
+ unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 392e9d89bd9ba..7b8756050b752 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -152,6 +152,8 @@ public:
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
+ ImmTyDFMT,
+ ImmTyNFMT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
@@ -260,6 +262,8 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
+ bool isSDWARegKind() const;
+
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
}
@@ -292,6 +296,8 @@ public:
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
+ bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -636,6 +642,8 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyDFMT: OS << "DFMT"; break;
+ case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -993,7 +1001,9 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
- bool validateOperandLimitations(const MCInst &Inst);
+ bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
+ bool validateConstantBusLimitations(const MCInst &Inst);
+ bool validateEarlyClobberLimitations(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1029,6 +1039,8 @@ public:
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultTFE() const;
@@ -1042,6 +1054,7 @@ public:
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
AMDGPUOperand::Ptr defaultOffsetU12() const;
+ AMDGPUOperand::Ptr defaultOffsetS13() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
@@ -1243,6 +1256,15 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
+bool AMDGPUOperand::isSDWARegKind() const {
+ if (AsmParser->isVI())
+ return isVReg();
+ else if (AsmParser->isGFX9())
+ return isRegKind();
+ else
+ return false;
+}
+
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
{
assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
@@ -2083,7 +2105,7 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
}
-bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
+bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
unsigned ConstantBusUseCount = 0;
@@ -2137,6 +2159,60 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
return ConstantBusUseCount <= 1;
}
+bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
+
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+
+ const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ if (DstIdx == -1 ||
+ Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) {
+ return true;
+ }
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ assert(DstIdx != -1);
+ const MCOperand &Dst = Inst.getOperand(DstIdx);
+ assert(Dst.isReg());
+ const unsigned DstReg = mc2PseudoReg(Dst.getReg());
+
+ const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ for (int SrcIdx : SrcIndices) {
+ if (SrcIdx == -1) break;
+ const MCOperand &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg()) {
+ const unsigned SrcReg = mc2PseudoReg(Src.getReg());
+ if (isRegIntersect(DstReg, SrcReg, TRI)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
+ const SMLoc &IDLoc) {
+ if (!validateConstantBusLimitations(Inst)) {
+ Error(IDLoc,
+ "invalid operand (violates constant bus restrictions)");
+ return false;
+ }
+ if (!validateEarlyClobberLimitations(Inst)) {
+ Error(IDLoc,
+ "destination must be different than all sources");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2169,9 +2245,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
- if (!validateOperandLimitations(Inst)) {
- return Error(IDLoc,
- "invalid operand (violates constant bus restrictions)");
+ if (!validateInstruction(Inst, IDLoc)) {
+ return true;
}
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst, getSTI());
@@ -2554,11 +2629,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
return MatchOperand_ParseFail;
Parser.Lex();
+
+ bool IsMinus = false;
+ if (getLexer().getKind() == AsmToken::Minus) {
+ Parser.Lex();
+ IsMinus = true;
+ }
+
if (getLexer().isNot(AsmToken::Integer))
return MatchOperand_ParseFail;
if (getParser().parseAbsoluteExpression(Int))
return MatchOperand_ParseFail;
+
+ if (IsMinus)
+ Int = -Int;
break;
}
}
@@ -3743,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
+void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+ // Add the register arguments
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ continue;
+ }
+ assert(Op.isImm());
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
+ }
+
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+}
+
//===----------------------------------------------------------------------===//
// mimg
//===----------------------------------------------------------------------===//
@@ -3870,6 +3993,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -3919,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
+ {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4475,12 +4604,11 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
- // V_NOP_sdwa_vi has no optional sdwa arguments
+ // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (isGFX9() &&
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4490,8 +4618,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
case SIInstrFlags::VOP2:
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (isGFX9() &&
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
}
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
@@ -4501,9 +4628,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
break;
case SIInstrFlags::VOPC:
- if (isVI()) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 2aca65ac84303..2e96c14eaa320 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
string OpName = NAME # suffix;
}
+class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF classes
//===----------------------------------------------------------------------===//
@@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
let EXP_CNT = 1;
let MTBUF = 1;
let Uses = [EXEC];
-
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
+
+ let AsmMatchConverter = "cvtMtbuf";
+
+ bits<1> offen = 0;
+ bits<1> idxen = 0;
+ bits<1> addr64 = 0;
+ bits<1> has_vdata = 1;
+ bits<1> has_vaddr = 1;
+ bits<1> has_glc = 1;
+ bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<4> dfmt_value = 1; // the value for dfmt if no such operand
+ bits<3> nfmt_value = 0; // the value for nfmt if no such operand
+ bits<1> has_srsrc = 1;
+ bits<1> has_soffset = 1;
+ bits<1> has_offset = 1;
+ bits<1> has_slc = 1;
+ bits<1> has_tfe = 1;
+ bits<1> has_dfmt = 1;
+ bits<1> has_nfmt = 1;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- Enc64 {
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
- bits<8> vdata;
bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> addr64;
- bits<4> dfmt;
- bits<3> nfmt;
- bits<8> vaddr;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
- let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
+ bits<1> glc;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+class getMTBUFInsDA<list<RegisterClass> vdataList,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag InsNoData = !if(!empty(vaddrList),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag InsData = !if(!empty(vaddrList),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe)
+ );
+ dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
-class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs regClass:$dst),
- (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ (ins))))));
+}
+
+class getMTBUFAsmOps<int addrKind> {
+ string Pfx =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+ "")))));
+ string ret = Pfx # "$offset";
+}
+
+class MTBUF_SetupAddr<int addrKind> {
+ bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+ bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+class MTBUF_Load_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MTBUF_Pseudo<opName,
+ (outs vdataClass:$vdata),
+ getMTBUFIns<addrKindCopy>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 1;
let mayStore = 0;
}
-class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs),
- (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
+ i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
+ i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+class MTBUF_Store_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MTBUF_Pseudo<opName,
+ (outs),
+ getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
}
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ ValueType store_vt = i32,
+ SDPatternOperator st = null_frag> {
+
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+
//===----------------------------------------------------------------------===//
// MUBUF classes
//===----------------------------------------------------------------------===//
@@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
-def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
-def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
-def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
} // End let SubtargetPredicate = isGCN
@@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF
// MTBUF Patterns
//===----------------------------------------------------------------------===//
-// TBUFFER_STORE_FORMAT_*, addr64=0
-class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
- (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
- i32:$soffset, imm:$inst_offset, imm:$dfmt,
- imm:$nfmt, imm:$offen, imm:$idxen,
- imm:$glc, imm:$slc, imm:$tfe),
- (opcode
- $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
- (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
- (as_i1imm $slc), (as_i1imm $tfe), $soffset)
->;
+//===----------------------------------------------------------------------===//
+// tbuffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
-def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
-def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
-def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
-def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+
+multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
} // End let Predicates = [isGCN]
@@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
let AssemblerPredicate=isSICI;
let DecoderNamespace="SICI";
- bits<1> addr64;
- let Inst{15} = addr64;
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{15} = ps.addr64;
let Inst{18-16} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
+ def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
//===----------------------------------------------------------------------===//
// CI
@@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate=isVI;
let DecoderNamespace="VI";
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
+ def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 88c92b9582fd0..04308fb3aaf64 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) {
MCDisassembler::SoftFail;
}
+static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
+ uint16_t NameIdx) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
+ if (OpIdx != -1) {
+ auto I = MI.begin();
+ std::advance(I, OpIdx);
+ MI.insert(I, Op);
+ }
+ return OpIdx;
+}
+
static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -106,12 +117,12 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
-#define DECODE_SDWA9(DecName) \
-DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName)
+#define DECODE_SDWA(DecName) \
+DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
-DECODE_SDWA9(Src32)
-DECODE_SDWA9(Src16)
-DECODE_SDWA9(VopcDst)
+DECODE_SDWA(Src32)
+DECODE_SDWA(Src16)
+DECODE_SDWA(VopcDst)
#include "AMDGPUGenDisassemblerTables.inc"
@@ -149,6 +160,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
raw_ostream &WS,
raw_ostream &CS) const {
CommentStream = &CS;
+ bool IsSDWA = false;
// ToDo: AMDGPUDisassembler supports only VI ISA.
if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
@@ -170,10 +182,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res) break;
Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
- if (Res) break;
+ if (Res) { IsSDWA = true; break; }
Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
- if (Res) break;
+ if (Res) { IsSDWA = true; break; }
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -200,17 +212,36 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
// Insert dummy unused src2_modifiers.
- int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::src2_modifiers);
- auto I = MI.begin();
- std::advance(I, Src2ModIdx);
- MI.insert(I, MCOperand::createImm(0));
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src2_modifiers);
}
+ if (Res && IsSDWA)
+ Res = convertSDWAInst(MI);
+
Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
return Res;
}
+DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
+ // VOPC - insert clamp
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
+ if (SDst != -1) {
+ // VOPC - insert VCC register as sdst
+ insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC),
+ AMDGPU::OpName::sdst);
+ } else {
+ // VOP1/2 - insert omod if present in instruction
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
+ }
+ }
+ return MCDisassembler::Success;
+}
+
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
return getContext().getRegisterInfo()->
getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
@@ -524,8 +555,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
}
- assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
-
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
return decodeIntImmed(Val);
@@ -592,36 +621,43 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width,
- unsigned Val) const {
+MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
+ unsigned Val) const {
using namespace AMDGPU::SDWA;
- if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_VGPR_MAX) {
- return createRegOperand(getVgprClassId(Width),
- Val - SDWA9EncValues::SRC_VGPR_MIN);
- }
- if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
- Val <= SDWA9EncValues::SRC_SGPR_MAX) {
- return createSRegOperand(getSgprClassId(Width),
- Val - SDWA9EncValues::SRC_SGPR_MIN);
- }
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width),
+ Val - SDWA9EncValues::SRC_VGPR_MIN);
+ }
+ if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ return createSRegOperand(getSgprClassId(Width),
+ Val - SDWA9EncValues::SRC_SGPR_MIN);
+ }
- return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ return createRegOperand(getVgprClassId(Width), Val);
+ }
+ llvm_unreachable("unsupported target");
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const {
- return decodeSDWA9Src(OPW16, Val);
+MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
+ return decodeSDWASrc(OPW16, Val);
}
-MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const {
- return decodeSDWA9Src(OPW32, Val);
+MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
+ return decodeSDWASrc(OPW32, Val);
}
-MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const {
+MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
+ assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] &&
+ "SDWAVopcDst should be present only on GFX9");
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
if (Val > AMDGPU::EncValues::SGPR_MAX) {
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5fa3cf1a223fa..3d71db909e20d 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -65,6 +65,8 @@ public:
uint64_t Inst,
uint64_t Address) const;
+ DecodeStatus convertSDWAInst(MCInst &MI) const;
+
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
@@ -105,10 +107,10 @@ public:
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
- MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const;
- MCOperand decodeSDWA9Src16(unsigned Val) const;
- MCOperand decodeSDWA9Src32(unsigned Val) const;
- MCOperand decodeSDWA9VopcDst(unsigned Val) const;
+ MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSDWASrc16(unsigned Val) const;
+ MCOperand decodeSDWASrc32(unsigned Val) const;
+ MCOperand decodeSDWAVopcDst(unsigned Val) const;
};
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 98eda288bcacb..edca6fcd812c8 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -31,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let VM_CNT = 1;
let LGKM_CNT = 1;
- let Uses = [EXEC, FLAT_SCR]; // M0
-
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
@@ -40,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ bits<1> is_flat_global = 0;
+ bits<1> is_flat_scratch = 0;
+
bits<1> has_vdst = 1;
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+
+ // TODO: M0 if it could possibly access LDS (before gfx9? only)?
+ let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]);
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -68,7 +72,10 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Only valid on gfx9
bits<1> lds = 0; // XXX - What does this actually do?
- bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+
+ // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+ bits<2> seg = !if(ps.is_flat_global, 0b10,
+ !if(ps.is_flat_scratch, 0b01, 0));
// Signed offset. Highest bit ignored for flat and treated as 12-bit
// unsigned for flat accesses.
@@ -81,7 +88,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// Only valid on GFX9+
let Inst{12-0} = offset;
let Inst{13} = lds;
- let Inst{15-14} = 0;
+ let Inst{15-14} = seg;
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
let Inst{17} = slc;
@@ -106,6 +113,16 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let mayLoad = 1;
}
+class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
@@ -119,6 +136,16 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let has_vdst = 0;
}
+class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
multiclass FLAT_Atomic_Pseudo<
string opName,
RegisterClass vdst_rc,
@@ -306,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isCI
+let SubtargetPredicate = HasFlatGlobalInsts in {
+def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
+def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
+def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
+def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
+def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
+def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
+def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
+def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+
+def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
+def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
+def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
+def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
+def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
+def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+
+} // End SubtargetPredicate = HasFlatGlobalInsts
+
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -557,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>;
+def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>;
+def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>;
+def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>;
+def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>;
+def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>;
+def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>;
+def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>;
+
+def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>;
+def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>;
+def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>;
+def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>;
+def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>;
+def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>;
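
The GLOBAL_* definitions above reuse the FLAT encoding and differ only in the seg[15:14] field, which the FLAT_Real change derives from the is_flat_global/is_flat_scratch bits. A small sketch of that selection using the values from the comment in the hunk (00=flat, 01=scratch, 10=global); this is an illustration, not the TableGen itself.

#include <cassert>

static unsigned flatSegBits(bool IsGlobal, bool IsScratch) {
  assert(!(IsGlobal && IsScratch) && "at most one segment override");
  if (IsGlobal)  return 0b10;
  if (IsScratch) return 0b01;
  return 0b00; // plain flat
}

int main() {
  assert(flatSegBits(true, false) == 0b10);  // global_load/store_*
  assert(flatSegBits(false, true) == 0b01);  // scratch_load/store_*
  assert(flatSegBits(false, false) == 0b00); // flat_*
}
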
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index b84640230eeeb..7c31c8e397ba7 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
+void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm()));
+}
+
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -118,6 +123,16 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << ((OpNo == 0)? "offset:" : " offset:");
+ printS16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
O << " vm";
}
+void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " dfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " nfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
switch (RegNo) {
@@ -379,7 +412,6 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
- assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
printImmediate16(Lo16, STI, O);
}
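
printOffsetS13 prints the new GFX9 signed 13-bit FLAT offset by reinterpreting the immediate as signed before formatting. A sketch of the signed interpretation involved, with a made-up helper name: the raw 13-bit pattern 0x1FFF should render as offset:-1.

#include <cstdint>
#include <cstdio>

static int32_t signExtend13(uint32_t Raw) {
  return static_cast<int32_t>(Raw << 19) >> 19; // move bit 12 up to bit 31
}

int main() {
  std::printf("offset:%d\n", signExtend13(0x1FFF)); // offset:-1
  std::printf("offset:%d\n", signExtend13(0x0FFF)); // offset:4095
}
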
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index c8094c4b840a1..7bbf99a85f409 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -42,6 +42,7 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -52,6 +53,9 @@ private:
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+
void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -84,6 +88,10 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 0a9c2b94c1eee..2b408ff10caae 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -30,14 +30,9 @@ public:
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
- void processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
- MCValue Res;
-
- // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
- // used for long branches, this function will be called with
- // IsResolved = false and Value set to some pre-computed value. In
- // the example above, the value would be:
- // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
- // This is not what we want. We just want the expression computation
- // only. The reason the MC layer subtracts the current offset from the
- // expression is because the fixup is of kind FK_PCRel_4.
- // For these scenarios, evaluateAsValue gives us the computation that we
- // want.
- if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) &&
- Res.isAbsolute()) {
- Value = Res.getConstant();
- IsResolved = true;
-
- }
- if (IsResolved)
- Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
-void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsPCRel) const {
+ Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
uint32_t Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value.
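
With processFixupValue gone, applyFixup itself calls adjustFixupValue and then ORs the shifted value into the fragment bytes. A generic sketch of that byte-masking step, under the assumption of little-endian byte order; this is not the backend's exact code.

#include <cstdint>
#include <vector>

static void patchBytes(std::vector<uint8_t> &Data, unsigned Offset,
                       unsigned NumBytes, uint64_t Value) {
  for (unsigned I = 0; I != NumBytes; ++I)
    Data[Offset + I] |= uint8_t((Value >> (I * 8)) & 0xff); // little-endian
}

int main() {
  std::vector<uint8_t> Data(8, 0);
  patchBytes(Data, 4, 4, 0x12345678); // a 4-byte fixup at offset 4
  return (Data[4] == 0x78 && Data[7] == 0x12) ? 0 : 1;
}
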
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index a856b17a228f0..1b062064ace1c 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,15 +52,15 @@ public:
return 0;
}
- virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 0;
}
- virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
return 0;
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index e02acf516c0db..376c9bfe5ccf2 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -70,13 +70,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
- unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
- unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -252,9 +252,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
- assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
uint32_t Encoding = getLit16Encoding(Lo16, STI);
- assert(Encoding != 255 && "packed constants can only be inline immediates");
return Encoding;
}
default:
@@ -328,11 +326,11 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned
-SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
-
+
uint64_t RegEnc = 0;
const MCOperand &MO = MI.getOperand(OpNo);
@@ -347,9 +345,9 @@ SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
}
unsigned
-SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
using namespace AMDGPU::SDWA;
uint64_t RegEnc = 0;
@@ -365,6 +363,25 @@ SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
return RegEnc;
}
+static bool needsPCRel(const MCExpr *Expr) {
+ switch (Expr->getKind()) {
+ case MCExpr::SymbolRef:
+ return true;
+ case MCExpr::Binary: {
+ auto *BE = cast<MCBinaryExpr>(Expr);
+ if (BE->getOpcode() == MCBinaryExpr::Sub)
+ return false;
+ return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS());
+ }
+ case MCExpr::Unary:
+ return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr());
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ return false;
+ }
+ llvm_unreachable("invalid kind");
+}
+
uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
@@ -373,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MRI.getEncodingValue(MO.getReg());
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
- const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ // FIXME: Whether this expression is PCRel or not should not depend on what
+ // the expression looks like. Given that this is just a general expression,
+ // it should probably be FK_Data_4 and whatever is producing
+ //
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)
+ //
+ // And expecting a PCRel should instead produce
+ //
+ // .Ltmp1:
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1
MCFixupKind Kind;
- if (Expr && Expr->getSymbol().isExternal())
- Kind = FK_Data_4;
- else
+ if (needsPCRel(MO.getExpr()))
Kind = FK_PCRel_4;
+ else
+ Kind = FK_Data_4;
Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
}
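
The new needsPCRel walk replaces the old "external symbol" test: a bare symbol reference (possibly plus a constant) now gets an FK_PCRel_4 fixup, while a subtraction such as a label difference is treated as absolute data (FK_Data_4). A toy restatement of that decision on a simplified expression tree, assuming made-up node names, just to show the recursion:

#include <cassert>

struct Expr {
  enum Kind { Sym, Const, Add, Sub } K;
  const Expr *L = nullptr, *R = nullptr;
};

static bool needsPCRelToy(const Expr *E) {
  switch (E->K) {
  case Expr::Sym:   return true;
  case Expr::Const: return false;
  case Expr::Sub:   return false; // label difference: already absolute
  case Expr::Add:   return needsPCRelToy(E->L) || needsPCRelToy(E->R);
  }
  return false;
}

int main() {
  Expr S{Expr::Sym}, C{Expr::Const};
  Expr SymPlusC{Expr::Add, &S, &C}; // extern_sym + 16  -> FK_PCRel_4
  Expr Diff{Expr::Sub, &S, &S};     // label1 - label2  -> FK_Data_4
  assert(needsPCRelToy(&SymPlusC) && !needsPCRelToy(&Diff));
}
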
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index f6f2582aa11b3..d30d1d382588c 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -80,7 +80,7 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx600", SIFullSpeedModel,
+def : ProcessorModel<"gfx600", SIFullSpeedModel,
[FeatureISAVersion6_0_0]>;
def : ProcessorModel<"SI", SIFullSpeedModel,
@@ -95,7 +95,7 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
[FeatureISAVersion6_0_1]
>;
-def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
[FeatureISAVersion6_0_1]>;
def : ProcessorModel<"verde", SIQuarterSpeedModel,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index c55878f8bff0f..215791f4f92dd 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -584,23 +584,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 5cd90323ff67b..3915c0e5bdbed 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -118,9 +118,9 @@ namespace AMDGPU {
// Operand for source modifiers for VOP instructions
OPERAND_INPUT_MODS,
- // Operand for GFX9 SDWA instructions
- OPERAND_SDWA9_SRC,
- OPERAND_SDWA9_VOPC_DST,
+ // Operand for SDWA instructions
+ OPERAND_SDWA_SRC,
+ OPERAND_SDWA_VOPC_DST,
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5f5f25103c027..0a795c99f94e5 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -174,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
+static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto &Src = MI.getOperand(1);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = Src.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+
+ for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
+ const auto *UseMI = MO.getParent();
+ if (UseMI == &MI)
+ continue;
+ if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
+ UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
+ !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
+ return false;
+ }
+ // Change VGPR to SGPR destination.
+ MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+ return true;
+}
+
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
@@ -214,6 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
return false;
+ if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
+ return true;
+
// TODO: Could have multiple extracts?
unsigned SubReg = CopyUse.getOperand(1).getSubReg();
if (SubReg != AMDGPU::NoSubRegister)
@@ -563,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
TII->moveToVALU(MI);
+ } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+ tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
break;
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index e10f1ed3762e8..f391f67a241f1 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -166,6 +167,8 @@ static bool updateOperand(FoldCandidate &Fold,
if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
TargetRegisterInfo::isVirtualRegister(New->getReg())) {
Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+
+ Old.setIsUndef(New->isUndef());
return true;
}
@@ -470,7 +473,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
- if (Def->isMoveImmediate()) {
+ if (Def && Def->isMoveImmediate()) {
MachineOperand &ImmSrc = Def->getOperand(1);
if (ImmSrc.isImm())
return &ImmSrc;
@@ -921,12 +924,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// level.
bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
-
- MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index b1bd14e421f02..08a64de385018 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
+ if (ST.isAmdCodeObjectV2(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- if (MFI->hasPrivateMemoryInputPtr()) {
+ if (MFI->hasImplicitBufferPtr()) {
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
BuildMI(MBB, I, DL, Mov64, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else {
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
@@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineMemOperand::MODereferenceable,
0, 0);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
.addImm(0) // glc
.addMemOperand(MMO)
@@ -417,14 +417,69 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (MFI->isEntryFunction())
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction()) {
emitEntryFunctionPrologue(MF, MBB);
+ return;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL;
+
+ bool NeedFP = hasFP(MF);
+ if (NeedFP) {
+ // If we need a frame pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the frame pointer to reference
+ // locals.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ uint32_t NumBytes = MFI.getStackSize();
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction())
+ return;
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ if (StackPtrReg == AMDGPU::NoRegister)
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
+
+ // FIXME: Clarify the distinction between having no SP set and having one. For
+ // callee functions, it's really whether we need SP to be accurate or not.
+
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
}
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
@@ -557,3 +612,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
}
}
+
+bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+ // TODO: Still want to eliminate sometimes.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // XXX - Is this only called after frame is finalized? Should be able to check
+ // frame size.
+ return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
+}
+
+bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects();
+}
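
The prologue/epilogue pair above bumps the stack pointer SGPR by MFI.getStackSize() times the wavefront size because the frame size is tracked per lane while the offset held in the SGPR is a per-wave byte offset into scratch. A tiny arithmetic sketch with example numbers (the frame size is illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t PerLaneFrameBytes = 64; // MFI.getStackSize(), example value
  const uint32_t WavefrontSize = 64;     // lanes per wave on GCN
  uint32_t SPDelta = PerLaneFrameBytes * WavefrontSize;
  std::printf("prologue: s_add_u32 sp, sp, %u\n", SPDelta);
  std::printf("epilogue: s_sub_u32 sp, sp, %u\n", SPDelta);
}
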
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index e17adbe273614..d4dfa1c7eaa86 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -60,6 +60,10 @@ private:
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+public:
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasSP(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 441f1ef4bd04c..d0f4e00994de1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -211,6 +211,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDO, MVT::i32, Legal);
setOperationAction(ISD::USUBO, MVT::i32, Legal);
+ setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
+ setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
@@ -471,6 +474,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::ADDCARRY);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::SUBCARRY);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -1061,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
- if (Info.hasPrivateMemoryInputPtr()) {
- unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
- MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
- CCInfo.AllocateReg(PrivateMemoryPtrReg);
+ if (Info.hasImplicitBufferPtr()) {
+ unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+ MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -1227,7 +1234,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
- if (NeedSP){
+ if (NeedSP) {
unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
@@ -2998,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ if (getSubtarget()->isAmdCodeObjectV2(MF))
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+ unsigned Reg = TRI->getPreloadedValue(MF,
+ SIRegisterInfo::IMPLICIT_BUFFER_PTR);
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
}
case Intrinsic::amdgcn_dispatch_ptr:
@@ -3288,6 +3299,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec: {
@@ -3313,7 +3326,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
@@ -3328,6 +3340,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
}
+ case Intrinsic::amdgcn_tbuffer_load: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ Op.getOperand(5), // soffset
+ Op.getOperand(6), // offset
+ Op.getOperand(7), // dfmt
+ Op.getOperand(8), // nfmt
+ Op.getOperand(9), // glc
+ Op.getOperand(10) // slc
+ };
+
+ EVT VT = Op.getOperand(2).getValueType();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
// Basic sample.
case Intrinsic::amdgcn_image_sample:
case Intrinsic::amdgcn_image_sample_cl:
@@ -3393,10 +3428,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp: {
@@ -3463,33 +3498,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
Op.getOperand(2), Op.getOperand(3));
}
- case AMDGPUIntrinsic::SI_tbuffer_store: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4),
- Op.getOperand(5),
- Op.getOperand(6),
- Op.getOperand(7),
- Op.getOperand(8),
- Op.getOperand(9),
- Op.getOperand(10),
- Op.getOperand(11),
- Op.getOperand(12),
- Op.getOperand(13),
- Op.getOperand(14)
- };
-
- EVT VT = Op.getOperand(3).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
- }
case AMDGPUIntrinsic::AMDGPU_kill: {
SDValue Src = Op.getOperand(2);
if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3505,7 +3513,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const MachineFunction &MF = DAG.getMachineFunction();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
@@ -3514,6 +3521,75 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
return SDValue();
};
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
+
+ // Extract vindex and voffset from vaddr as appropriate
+ const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
+ const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
+ SDValue VAddr = Op.getOperand(5);
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+
+ assert(!(OffEn->isOne() && IdxEn->isOne()) &&
+ "Legacy intrinsic doesn't support both offset and index - use new version");
+
+ SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
+ SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
+
+ // Deal with the vec-3 case
+ const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
+ auto Opcode = NumChannels->getZExtValue() == 3 ?
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
+
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(3), // vdata
+ Op.getOperand(2), // rsrc
+ VIndex,
+ VOffset,
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // inst_offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(12), // glc
+ Op.getOperand(13), // slc
+ };
+
+ assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
+ "Value of tfe other than zero is unsupported");
+
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(Opcode, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
+ case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // voffset
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(10), // glc
+ Op.getOperand(11) // slc
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
default:
return Op;
}
@@ -4839,6 +4915,103 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // add x, zext (setcc) => addcarry x, 0, setcc
+ // add x, sext (setcc) => subcarry x, 0, setcc
+ unsigned Opc = LHS.getOpcode();
+ if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
+ Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
+ std::swap(RHS, LHS);
+
+ Opc = RHS.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ auto Cond = RHS.getOperand(0);
+ if (Cond.getOpcode() != ISD::SETCC &&
+ Cond.getOpcode() != AMDGPUISD::FP_CLASS)
+ break;
+ SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
+ SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
+ return DAG.getNode(Opc, SL, VTList, Args);
+ }
+ case ISD::ADDCARRY: {
+ // add x, (addcarry y, 0, cc) => addcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0) break;
+ SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
+ }
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performSubCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ unsigned Opc = LHS.getOpcode();
+ if (Opc != ISD::SUBCARRY)
+ std::swap(RHS, LHS);
+
+ if (LHS.getOpcode() == ISD::SUBCARRY) {
+ // sub (subcarry x, 0, cc), y => subcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+ SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
+ return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+
+ // addcarry (add x, y), 0, cc => addcarry x, y, cc
+ // subcarry (sub x, y), 0, cc => subcarry x, y, cc
+ unsigned LHSOpc = LHS.getOpcode();
+ unsigned Opc = N->getOpcode();
+ if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
+ (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
+ SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performFAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -5009,6 +5182,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::ADD:
+ return performAddCombine(N, DCI);
+ case ISD::SUB:
+ return performSubCombine(N, DCI);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ return performAddCarrySubCarryCombine(N, DCI);
case ISD::FADD:
return performFAddCombine(N, DCI);
case ISD::FSUB:
@@ -5425,15 +5605,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
- SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
-
- return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
- cast<RegisterSDNode>(VReg)->getReg(), VT);
-}
-
//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
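
The performAddCombine / performSubCombine / performAddCarrySubCarryCombine logic above rests on two identities: zext of an i1 condition adds 0 or 1 (an addcarry with a zero addend), while sext adds 0 or -1, i.e. subtracts the carry (a subcarry). A scalar check of those identities, written as plain 32-bit arithmetic rather than SelectionDAG nodes:

#include <cassert>
#include <cstdint>

static uint32_t addcarry(uint32_t A, uint32_t B, bool CC) { return A + B + (CC ? 1 : 0); }
static uint32_t subcarry(uint32_t A, uint32_t B, bool CC) { return A - B - (CC ? 1 : 0); }

int main() {
  for (bool CC : {false, true}) {
    uint32_t X = 1000;
    assert(X + uint32_t(CC ? 1u : 0u)  == addcarry(X, 0, CC)); // add x, zext(cc)
    assert(X + uint32_t(CC ? ~0u : 0u) == subcarry(X, 0, CC)); // add x, sext(cc)
  }
}
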
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 8e2ec40b224cd..24f88e632d38e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -108,6 +108,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
+ SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -216,8 +219,6 @@ public:
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
- SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const override;
SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1097814e99ce2..c9b48fea7225e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2108,7 +2108,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint8_t OperandType) const {
- if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+ if (!MO.isImm() ||
+ OperandType < AMDGPU::OPERAND_SRC_FIRST ||
+ OperandType > AMDGPU::OPERAND_SRC_LAST)
return false;
// MachineOperand provides no way to tell the true operand size, since it only
@@ -2433,8 +2435,73 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Verify SDWA
+ if (isSDWA(MI)) {
+
+ if (!ST.hasSDWA()) {
+ ErrInfo = "SDWA is not supported on this target";
+ return false;
+ }
+
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ if (DstIdx == -1)
+ DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst);
+
+ const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1)
+ continue;
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (!ST.hasSDWAScalar()) {
+ // Only VGPRs on VI
+ if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
+ ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
+ return false;
+ }
+ } else {
+ // No immediates on GFX9
+ if (!MO.isReg()) {
+ ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
+ return false;
+ }
+ }
+ }
+
+ if (!ST.hasSDWAOmod()) {
+ // No omod allowed on VI
+ const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod != nullptr &&
+ (!OMod->isImm() || OMod->getImm() != 0)) {
+ ErrInfo = "OMod not allowed in SDWA instructions on VI";
+ return false;
+ }
+ }
+
+ uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+ if (isVOPC(BasicOpcode)) {
+ if (!ST.hasSDWASdst() && DstIdx != -1) {
+ // Only vcc allowed as dst on VI for VOPC
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
+ ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
+ return false;
+ }
+ } else if (!ST.hasSDWAClampVOPC()) {
+ // No clamp allowed on GFX9 for VOPC
+ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp != nullptr &&
+ (!Clamp->isImm() || Clamp->getImm() != 0)) {
+ ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
+ return false;
+ }
+ }
+ }
+ }
+
// Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
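
The SDWA verification block added to verifyInstruction encodes a small per-subtarget constraint table; restated here for reference as an illustrative struct (the feature names follow the SubtargetFeature definitions, the struct itself is not part of the patch):

struct SDWAConstraints {
  bool VGPROperandsOnly; // !hasSDWAScalar(): only VGPR operands allowed
  bool AllowOMod;        // hasSDWAOmod(): omod modifier permitted
  bool VOPCDstMustBeVCC; // !hasSDWASdst(): VOPC may only write VCC
  bool AllowClampOnVOPC; // hasSDWAClampVOPC(): clamp permitted on VOPC
};

constexpr SDWAConstraints VI   = {true,  false, true,  true };
constexpr SDWAConstraints GFX9 = {false, true,  false, false};

static_assert(!VI.AllowOMod && GFX9.AllowOMod, "omod with SDWA is a GFX9 addition");

int main() {}
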
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index f6e5e8883f63c..74b48c7618087 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -814,6 +814,9 @@ namespace AMDGPU {
int getSDWAOp(uint16_t Opcode);
LLVM_READONLY
+ int getBasicFromSDWAOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
LLVM_READONLY
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 470a47b024433..3b4a8b5d1e817 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -20,6 +20,8 @@ def SIEncodingFamily {
int NONE = -1;
int SI = 0;
int VI = 1;
+ int SDWA = 2;
+ int SDWA9 = 3;
}
//===----------------------------------------------------------------------===//
@@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
- SDTypeProfile<0, 13,
- [SDTCisVT<0, v4i32>, // rsrc(SGPR)
- SDTCisVT<1, iAny>, // vdata(VGPR)
- SDTCisVT<2, i32>, // num_channels(imm)
- SDTCisVT<3, i32>, // vaddr(VGPR)
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+ SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // inst_offset(imm)
+ SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // dfmt(imm)
SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // offen(imm)
- SDTCisVT<9, i32>, // idxen(imm)
- SDTCisVT<10, i32>, // glc(imm)
- SDTCisVT<11, i32>, // slc(imm)
- SDTCisVT<12, i32> // tfe(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
]>,
- [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
+def SDTtbuffer_store : SDTypeProfile<0, 10,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -452,25 +470,25 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
-class SDWA9Src : RegisterOperand<VS_32> {
+class SDWASrc : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA9_SRC";
- let EncoderMethod = "getSDWA9SrcEncoding";
+ let OperandType = "OPERAND_SDWA_SRC";
+ let EncoderMethod = "getSDWASrcEncoding";
}
-def SDWA9Src32 : SDWA9Src {
- let DecoderMethod = "decodeSDWA9Src32";
+def SDWASrc32 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc32";
}
-def SDWA9Src16 : SDWA9Src {
- let DecoderMethod = "decodeSDWA9Src16";
+def SDWASrc16 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc16";
}
-def SDWA9VopcDst : VOPDstOperand<SReg_64> {
+def SDWAVopcDst : VOPDstOperand<SReg_64> {
let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_SDWA9_VOPC_DST";
- let EncoderMethod = "getSDWA9VopcDstEncoding";
- let DecoderMethod = "decodeSDWA9VopcDst";
+ let OperandType = "OPERAND_SDWA_VOPC_DST";
+ let EncoderMethod = "getSDWAVopcDstEncoding";
+ let DecoderMethod = "decodeSDWAVopcDst";
}
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
@@ -525,7 +543,7 @@ def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
-def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>;
+def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
@@ -545,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
+def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
+def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+
def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
@@ -634,13 +655,13 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
-def FPRegInputModsMatchClass : AsmOperandClass {
- let Name = "RegWithFPInputMods";
+def FPRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isRegKind";
+ let PredicateMethod = "isSDWARegKind";
}
-def FPRegInputMods : InputMods <FPRegInputModsMatchClass> {
+def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
@@ -655,13 +676,13 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
}
-def IntRegInputModsMatchClass : AsmOperandClass {
- let Name = "RegWithIntInputMods";
+def IntRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isRegKind";
+ let PredicateMethod = "isSDWARegKind";
}
-def IntRegInputMods : InputMods <IntRegInputModsMatchClass> {
+def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
@@ -851,10 +872,10 @@ class getVALUDstForVT<ValueType VT> {
}
// Returns the register class to use for the destination of VOP[12C]
-// instructions with GFX9 SDWA extension
-class getSDWA9DstForVT<ValueType VT> {
+// instructions with SDWA extension
+class getSDWADstForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 1),
- SDWA9VopcDst, // VOPC
+ SDWAVopcDst, // VOPC
VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
}
@@ -898,8 +919,8 @@ class getVregSrcForVT<ValueType VT> {
!if(!eq(VT.Size, 64), VReg_64, VGPR_32));
}
-class getSDWA9SrcForVT <ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32);
+class getSDWASrcForVT <ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
}
// Returns the register class to use for sources of VOP3 instructions for the
@@ -995,7 +1016,7 @@ class getSrcMod <ValueType VT> {
);
}
-// Return type of input modifiers operand specified input operand for SDWA/DPP
+// Return type of input modifiers operand for the specified input operand for DPP
class getSrcModExt <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
@@ -1004,13 +1025,13 @@ class getSrcModExt <ValueType VT> {
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
-// Return type of input modifiers operand specified input operand for SDWA 9
-class getSrcModSDWA9 <ValueType VT> {
+// Return type of input modifiers operand for the specified input operand for SDWA
+class getSrcModSDWA <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
0)));
- Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods);
+ Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1141,36 +1162,12 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
/* endif */)));
}
-class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod,
- ValueType DstVT> {
- dag ret = !if(!eq(NumSrcArgs, 0),
- // VOP1 without input operands (V_NOP)
- (ins),
- !if(!eq(NumSrcArgs, 1),
- // VOP1_SDWA
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel),
- !if(!eq(NumSrcArgs, 2),
- !if(!eq(DstVT.Size, 1),
- // VOPC_SDWA with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel)),
- (ins)/* endif */)));
-}
-// Ins for GFX9 SDWA
-class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
- bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
- ValueType DstVT> {
+// Ins for SDWA
+class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
+ bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
+ ValueType DstVT> {
dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
@@ -1178,31 +1175,31 @@ class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArg
!if(!eq(NumSrcArgs, 1),
// VOP1
!if(!eq(HasSDWAOMod, 0),
- // VOP1_SDWA9 without omod
+ // VOP1_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel),
- // VOP1_SDWA9 with omod
+ // VOP1_SDWA with omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp, omod:$omod,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel)),
!if(!eq(NumSrcArgs, 2),
!if(!eq(DstVT.Size, 1),
- // VOPC_SDWA9
+ // VOPC_SDWA
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA9
+ clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA
!if(!eq(HasSDWAOMod, 0),
- // VOP2_SDWA9 without omod
+ // VOP2_SDWA without omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp,
dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP1_SDWA9 with omod
+ // VOP2_SDWA with omod
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, omod:$omod,
@@ -1220,12 +1217,12 @@ class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {
(outs)); // V_NOP
}
-// Outs for GFX9 SDWA
-class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> {
+// Outs for SDWA
+class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> {
dag ret = !if(HasDst,
!if(!eq(DstVT.Size, 1),
- (outs DstRCSDWA9:$sdst),
- (outs DstRCSDWA9:$vdst)),
+ (outs DstRCSDWA:$sdst),
+ (outs DstRCSDWA:$vdst)),
(outs)); // V_NOP
}
@@ -1387,8 +1384,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
- field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
- field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
@@ -1396,19 +1392,15 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
- field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
- field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
- field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
- field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
+ field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
- field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
- field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
- field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret;
- field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret;
+ field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
@@ -1457,8 +1449,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
- field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret;
+ field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
@@ -1471,11 +1462,9 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
- HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
- field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs,
- HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9,
- DstVT>.ret;
+
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
@@ -1628,13 +1617,13 @@ def getSDWAOp : InstrMapping {
let ValueCols = [["SDWA"]];
}
-// Maps ordinary instructions to their SDWA GFX9 counterparts
-def getSDWA9Op : InstrMapping {
+// Maps SDWA instructions to their ordinary counterparts
+def getBasicFromSDWAOp : InstrMapping {
let FilterClass = "VOP";
let RowFields = ["OpName"];
let ColFields = ["AsmVariantName"];
- let KeyCol = ["Default"];
- let ValueCols = [["SDWA9"]];
+ let KeyCol = ["SDWA"];
+ let ValueCols = [["Default"]];
}
def getMaskedMIMGOp : InstrMapping {
@@ -1669,7 +1658,9 @@ def getMCOpcodeGen : InstrMapping {
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
- [!cast<string>(SIEncodingFamily.VI)]];
+ [!cast<string>(SIEncodingFamily.VI)],
+ [!cast<string>(SIEncodingFamily.SDWA)],
+ [!cast<string>(SIEncodingFamily.SDWA9)]];
}
// Get equivalent SOPK instruction.
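Note: getBasicFromSDWAOp and getMCOpcodeGen above are InstrMapping tables: each row is one OpName, each column one AsmVariantName (or Subtarget/encoding family), and a lookup converts an opcode from the key column to a value column. Below is a minimal, stand-alone C++ sketch of that row/column lookup; every name in it (Op, Variant, MappingEntry, lookupVariant) is an illustrative stand-in, not the TableGen-generated AMDGPU code.

// Sketch only: models the row/column resolution an InstrMapping performs.
#include <array>
#include <cstdint>
#include <iostream>

enum Op : uint16_t {
  V_ADD_F32_e32, V_ADD_F32_sdwa,
  V_MUL_F32_e32, V_MUL_F32_sdwa,
  INVALID = 0xffff
};
enum class Variant { Default, SDWA };

struct MappingEntry {
  Op Row[2]; // one opcode per column: [Default, SDWA]
};

// One row per OpName.
constexpr std::array<MappingEntry, 2> Table = {{
    {{V_ADD_F32_e32, V_ADD_F32_sdwa}},
    {{V_MUL_F32_e32, V_MUL_F32_sdwa}},
}};

// Find the row whose 'From' column holds Key; return that row's 'To' column.
Op lookupVariant(Op Key, Variant From, Variant To) {
  for (const MappingEntry &E : Table)
    if (E.Row[static_cast<int>(From)] == Key)
      return E.Row[static_cast<int>(To)];
  return INVALID;
}

int main() {
  // A getBasicFromSDWAOp-style query: SDWA opcode -> ordinary opcode.
  Op Basic = lookupVariant(V_ADD_F32_sdwa, Variant::SDWA, Variant::Default);
  std::cout << (Basic == V_ADD_F32_e32) << '\n'; // prints 1
  return 0;
}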
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 18b197ddb7ae7..3203c38dae344 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false),
- PrivateMemoryInputPtr(false) {
+ ImplicitBufferPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
@@ -86,6 +86,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ScratchWaveOffsetReg = AMDGPU::SGPR4;
FrameOffsetReg = AMDGPU::SGPR5;
+ StackPtrOffsetReg = AMDGPU::SGPR32;
return;
}
@@ -150,7 +151,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
DispatchID = true;
} else if (ST.isMesaGfxShader(MF)) {
if (HasStackObjects || MaySpill)
- PrivateMemoryInputPtr = true;
+ ImplicitBufferPtr = true;
}
// We don't need to worry about accessing spills with flat instructions.
@@ -203,11 +204,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return FlatScratchInitUserSGPR;
}
-unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
- PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+ ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
NumUserSGPRs += 2;
- return PrivateMemoryPtrUserSGPR;
+ return ImplicitBufferPtrUserSGPR;
}
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9fdb8caac6f21..05aa249584bf1 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned StackPtrOffsetReg;
// Input registers for non-HSA ABI
- unsigned PrivateMemoryPtrUserSGPR;
+ unsigned ImplicitBufferPtrUserSGPR;
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
@@ -179,7 +179,7 @@ private:
// Private memory buffer
// Compute directly in sgpr[0:1]
// Other shaders indirect 64-bits at sgpr[0:1]
- bool PrivateMemoryInputPtr : 1;
+ bool ImplicitBufferPtr : 1;
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
@@ -236,7 +236,7 @@ public:
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
- unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
+ unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -341,8 +341,8 @@ public:
return WorkItemIDZ;
}
- bool hasPrivateMemoryInputPtr() const {
- return PrivateMemoryInputPtr;
+ bool hasImplicitBufferPtr() const {
+ return ImplicitBufferPtr;
}
unsigned getNumUserSGPRs() const {
@@ -396,8 +396,8 @@ public:
return QueuePtrUserSGPR;
}
- unsigned getPrivateMemoryPtrUserSGPR() const {
- return PrivateMemoryPtrUserSGPR;
+ unsigned getImplicitBufferPtrUserSGPR() const {
+ return ImplicitBufferPtrUserSGPR;
}
bool hasSpilledSGPRs() const {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f4ddf1891683b..4ac23ef03cb32 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -67,9 +67,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
- bool isConvertibleToSDWA(const MachineInstr &MI) const;
+ bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
- void legalizeScalarOperands(MachineInstr &MI) const;
+ void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -224,7 +224,7 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
static bool isSubregOf(const MachineOperand &SubReg,
const MachineOperand &SuperReg,
const TargetRegisterInfo *TRI) {
-
+
if (!SuperReg.isReg() || !SubReg.isReg())
return false;
@@ -557,7 +557,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
+
if (TRI->isPhysicalRegister(Src0->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -590,7 +590,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
break;
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
+
if (TRI->isPhysicalRegister(Src1->getReg()) ||
TRI->isPhysicalRegister(Dst->getReg()))
break;
@@ -607,16 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+ const SISubtarget &ST) const {
   // Check if this instruction has an opcode that supports SDWA
- unsigned Opc = MI.getOpcode();
- if (AMDGPU::getSDWAOp(Opc) != -1)
- return true;
- int Opc32 = AMDGPU::getVOPe32(Opc);
- if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
- return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
- !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- return false;
+ int Opc = MI.getOpcode();
+ if (AMDGPU::getSDWAOp(Opc) == -1)
+ Opc = AMDGPU::getVOPe32(Opc);
+
+ if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
+ return false;
+
+ if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return false;
+
+ if (TII->isVOPC(Opc)) {
+ if (!ST.hasSDWASdst()) {
+ const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (SDst && SDst->getReg() != AMDGPU::VCC)
+ return false;
+ }
+
+ if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return false;
+
+ } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ return false;
+ }
+
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ Opc == AMDGPU::V_MAC_F32_e32))
+ return false;
+
+ return true;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -641,6 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
if (Dst) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
SDWAInst.add(*Dst);
+ } else {
+ Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ assert(Dst &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+ SDWAInst.add(*Dst);
}
   // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
@@ -677,9 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAInst.add(*Src2);
}
- // Initialize clamp.
+ // Copy clamp if present, initialize otherwise
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
- SDWAInst.addImm(0);
+ MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp) {
+ SDWAInst.add(*Clamp);
+ } else {
+ SDWAInst.addImm(0);
+ }
+
+ // Copy omod if present, initialize otherwise if needed
+ MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
+ SDWAInst.add(*OMod);
+ } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+ SDWAInst.addImm(0);
+ }
// Initialize dst_sel and dst_unused if present
if (Dst) {
@@ -733,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
// If an instruction was converted to SDWA it should not have immediates or SGPR
-// operands. Copy its scalar operands into VGPRs.
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
+// operands (one SGPR operand is allowed on GFX9). Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
- for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
- MachineOperand &Op = MI.getOperand(I);
+ unsigned ConstantBusCount = 0;
+ for (MachineOperand &Op: MI.explicit_uses()) {
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;
+
+ unsigned I = MI.getOperandNo(&Op);
if (Desc.OpInfo[I].RegClass == -1 ||
!TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
continue;
+
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
+ TRI->isSGPRReg(*MRI, Op.getReg())) {
+ ++ConstantBusCount;
+ continue;
+ }
+
unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
@@ -758,22 +808,20 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (!ST.hasSDWA() ||
- !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
+ if (!ST.hasSDWA())
return false;
- }
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
-
+
// Find all SDWA operands in MF.
matchSDWAOperands(MF);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
@@ -788,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
bool Ret = !ConvertedInstructions.empty();
while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
return Ret;
}
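The checks above gate SDWA conversion on subtarget features (omod, scalar sources, SGPR dst, clamp on VOPC, v_mac), and legalizeScalarOperands then copies any remaining scalar operands into VGPRs, keeping at most one SGPR when the subtarget has sdwa-scalar. Below is a minimal, stand-alone sketch of that one-SGPR-on-the-constant-bus rule; Operand, Subtarget, and the function itself are hypothetical stand-ins, not the in-tree MachineOperand/SISubtarget API.

// Sketch only: models the operand-legalization rule, outside of LLVM.
#include <iostream>
#include <string>
#include <vector>

struct Operand { bool IsSGPR; bool IsImm; std::string Name; };
struct Subtarget { bool HasSDWAScalar; };

// Returns the names of operands that must be copied into VGPRs first.
std::vector<std::string> legalizeScalarOperands(const std::vector<Operand> &Ops,
                                                const Subtarget &ST) {
  std::vector<std::string> NeedVGPRCopy;
  unsigned ConstantBusCount = 0;
  for (const Operand &Op : Ops) {
    if (!Op.IsSGPR && !Op.IsImm)
      continue; // already a VGPR operand
    // GFX9 SDWA may read one SGPR via the constant bus; anything else
    // (a second SGPR, or any immediate) needs a V_MOV into a VGPR.
    if (ST.HasSDWAScalar && ConstantBusCount == 0 && Op.IsSGPR && !Op.IsImm) {
      ++ConstantBusCount;
      continue;
    }
    NeedVGPRCopy.push_back(Op.Name);
  }
  return NeedVGPRCopy;
}

int main() {
  std::vector<Operand> Ops = {{true, false, "src0"}, {true, false, "src1"}};
  Subtarget GFX9{true}, VI{false};
  std::cout << legalizeScalarOperands(Ops, GFX9).size() << '\n'; // 1: one SGPR kept
  std::cout << legalizeScalarOperands(Ops, VI).size() << '\n';   // 2: both copied
  return 0;
}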
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b611f28fcabdf..ef6ad4ad0c8f3 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1044,18 +1044,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned CarryOut
= MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
unsigned ScaledReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // XXX - Should this use a vector shift?
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
- .addImm(Log2_32(ST.getWavefrontSize()));
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg, RegState::Kill);
// TODO: Fold if use instruction is another add of a constant.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
- .addReg(CarryOut, RegState::Define | RegState::Dead)
- .addImm(Offset)
- .addReg(ScaledReg, RegState::Kill);
+ if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addImm(Offset)
+ .addReg(ScaledReg, RegState::Kill);
+ } else {
+ unsigned ConstOffsetReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addReg(ConstOffsetReg, RegState::Kill)
+ .addReg(ScaledReg, RegState::Kill);
+ }
MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
}
@@ -1341,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- if (ST.isAmdCodeObjectV2(MF)) {
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
- }
- assert(MFI->hasPrivateMemoryInputPtr());
- return MFI->PrivateMemoryPtrUserSGPR;
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
+ assert(MFI->hasImplicitBufferPtr());
+ return MFI->ImplicitBufferPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 8fed6d5f9710f..600cc886cb595 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -197,12 +197,13 @@ public:
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+ IMPLICIT_BUFFER_PTR = 15,
// VGPRS:
- FIRST_VGPR_VALUE = 15,
+ FIRST_VGPR_VALUE = 16,
WORKITEM_ID_X = FIRST_VGPR_VALUE,
- WORKITEM_ID_Y = 16,
- WORKITEM_ID_Z = 17
+ WORKITEM_ID_Y = 17,
+ WORKITEM_ID_Z = 18
};
/// \brief Returns the physical register that \p Value is stored in.
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index c5f121757e623..96a18544f02ac 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ return false;
// Additional verification is needed for sdst/src2.
return true;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index f581e69980c79..26515b27bb77d 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -538,6 +538,27 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
Reg == AMDGPU::SCC;
}
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
+
+ if (Reg0 == Reg1) {
+ return true;
+ }
+
+ unsigned SubReg0 = TRI->getSubReg(Reg0, 1);
+ if (SubReg0 == 0) {
+ return TRI->getSubRegIndex(Reg1, Reg0) > 0;
+ }
+
+ for (unsigned Idx = 2; SubReg0 > 0; ++Idx) {
+ if (isRegIntersect(Reg1, SubReg0, TRI)) {
+ return true;
+ }
+ SubReg0 = TRI->getSubReg(Reg0, Idx);
+ }
+
+ return false;
+}
+
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
switch(Reg) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index eff0230d21f57..936e4921a7097 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -271,6 +271,9 @@ bool isGFX9(const MCSubtargetInfo &STI);
/// \brief Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
+/// \brief Is there any intersection between registers
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
+
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
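isRegIntersect above recursively walks sub-registers to decide whether two registers alias. The same question, phrased on an explicit model where each register is the set of 32-bit units it covers, is sketched below; Reg and overlaps are hypothetical illustration names, not part of MCRegisterInfo.

// Sketch only: a stand-alone analogue of the register-overlap query.
#include <algorithm>
#include <iostream>
#include <vector>

struct Reg { std::vector<unsigned> Units; }; // e.g. SGPR0_SGPR1 -> {0, 1}

bool overlaps(const Reg &A, const Reg &B) {
  for (unsigned U : A.Units)
    if (std::find(B.Units.begin(), B.Units.end(), U) != B.Units.end())
      return true;
  return false;
}

int main() {
  Reg SGPR0_SGPR1{{0, 1}}, SGPR1{{1}}, SGPR2{{2}};
  std::cout << overlaps(SGPR0_SGPR1, SGPR1) << '\n'; // 1: they share unit 1
  std::cout << overlaps(SGPR0_SGPR1, SGPR2) << '\n'; // 0: disjoint
  return 0;
}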
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 95b5ef0a49dba..96b33c373f052 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -93,11 +93,6 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP1";
}
-class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOP1";
-}
-
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
@@ -117,7 +112,6 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
- def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
}
// Special profile for instructions which have clamp
@@ -274,12 +268,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
- let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0, 1>.ret;
@@ -545,8 +537,8 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
   // For now, DPP is left only for asm/dasm
// TODO: add corresponding pseudo
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 657cacaa792ca..7b9bc71ad4c77 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -114,11 +114,6 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP2";
}
-class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOP2";
-}
-
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
@@ -139,7 +134,6 @@ multiclass VOP2Inst <string opName,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
- def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
}
multiclass VOP2bInst <string opName,
@@ -156,10 +150,6 @@ multiclass VOP2bInst <string opName,
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
-
- def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -221,17 +211,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- VGPR_32:$src2, // stub argument
- clampmod:$clamp, omod:$omod,
- dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
@@ -289,15 +275,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- clampmod:$clamp, omod:$omod,
- dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
-
let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
Src1Mod:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
@@ -326,6 +307,8 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
@@ -335,6 +318,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
//===----------------------------------------------------------------------===//
@@ -397,20 +382,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
} // End isConvergent = 1
-defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
-defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
} // End SubtargetPredicate = isGCN
+def : Pat<
+ (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
+ (V_ADDC_U32_e64 $src0, $src1, $src2)
+>;
+
+def : Pat<
+ (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
+ (V_SUBB_U32_e64 $src0, $src1, $src2)
+>;
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -728,8 +722,8 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
multiclass VOP2_SDWA9_Real <bits<6> op> {
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index cd347b86d3050..f3482a22d5dcd 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -113,11 +113,6 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOPC";
}
-class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
- VOP_SDWA9_Pseudo <OpName, P, pattern> {
- let AsmMatchConverter = "cvtSdwaVOPC";
-}
-
// This class is used only with VOPC instructions. Use $sdst for out operand
class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -189,13 +184,6 @@ multiclass VOPC_Pseudos <string opName,
let isConvergent = DefExec;
let isCompare = 1;
}
-
- def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let SchedRW = P.Schedule;
- let isConvergent = DefExec;
- let isCompare = 1;
- }
}
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -540,14 +528,12 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$sdst, $src0_modifiers, $src1";
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
- let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
- Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
- src0_sel:$src0_sel, src1_sel:$src1_sel);
+
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
- //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let HasSrc1Mods = 0;
let HasClamp = 0;
let HasOMod = 0;
@@ -580,12 +566,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
-
- def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
- let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
- let SchedRW = p.Schedule;
- let isConvergent = DefExec;
- }
}
def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -954,8 +934,8 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
- VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+ VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
!cast<Instruction>(NAME#"_e32_vi")> {
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 4da654f84f9d1..e386f21c2ba49 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -232,11 +232,11 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
- let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
- let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
- let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -245,8 +245,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0)
- let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
@@ -300,6 +300,19 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
}
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+// than VGPRs (at most 1 can be an SGPR);
+// b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+// field.
+// a. When SD=1, the SDST is used as the destination for the compare result;
+// b. When SD=0, VCC is used.
+//
+// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
+
// gfx9 SDWA basic encoding
class VOP_SDWA9e<VOPProfile P> : Enc64 {
bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
@@ -353,6 +366,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
string Mnemonic = opName;
string AsmOperands = P.AsmSDWA;
+ string AsmOperands9 = P.AsmSDWA9;
let Size = 8;
let mayLoad = 0;
@@ -372,53 +386,9 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
VOPProfile Pfl = P;
}
-// GFX9 adds two features to SDWA:
-// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
-// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
-// than VGPRs (at most 1 can be an SGPR);
-// b. OMOD is the standard output modifier (result *2, *4, /2)
-// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
-// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
-// field.
-// a. When SD=1, the SDST is used as the destination for the compare result;
-// b.when SD=0, VCC is used.
-//
-// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
-
-class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
- InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
- VOP <opName>,
- SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
- MnemonicAlias <opName#"_sdwa9", opName> {
-
- let isPseudo = 1;
- let isCodeGenOnly = 1;
- let UseNamedOperandTable = 1;
-
- string Mnemonic = opName;
- string AsmOperands = P.AsmSDWA9;
-
- let Size = 8;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
-
- let VALU = 1;
- let SDWA = 1;
- let Uses = [EXEC];
-
- let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
- let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
- let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
- AMDGPUAsmVariants.Disable);
- let DecoderNamespace = "SDWA9";
-
- VOPProfile Pfl = P;
-}
-
class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -443,9 +413,9 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
-class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -458,13 +428,15 @@ class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
+ let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA9";
+
// Copy relevant pseudo op flags
- let SubtargetPredicate = ps.SubtargetPredicate;
- let AssemblerPredicate = ps.AssemblerPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
- let AsmVariantName = ps.AsmVariantName;
let UseNamedOperandTable = ps.UseNamedOperandTable;
- let DecoderNamespace = ps.DecoderNamespace;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;