Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td                                         |   3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                            |  96
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp                            |   6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h                              |   5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td                                |   5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructions.td                             |  16
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp                             |   6
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp                     |  74
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td                                |  75
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp                                |   4
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp |  24
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp                  |   1
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp                               | 185
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp                               | 120
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h                                 |   2
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp                                | 132
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp                              |   4
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaits.cpp                                 |  10
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp                                   |  58
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h                                     |   4
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td                                    |   3
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td                                 |  40
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp                         |   2
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h                           |  24
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp                                |  62
-rw-r--r--  lib/Target/AMDGPU/SOPInstructions.td                                |  36
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp                          |   9
27 files changed, 702 insertions(+), 304 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 2c7a2d8962d0..0f331486d0f8 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -406,7 +406,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
- FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
+ FeatureFastFMAF32
]
>;
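
GFX9 gains FeatureFastFMAF32 here. A feature like this is typically surfaced as a boolean subtarget accessor that lowering queries when choosing between a fused FMA and separate multiply/add. A minimal C++ sketch of that pattern, assuming an accessor named hasFastFMAF32() (illustrative; the real accessor lives in the subtarget header):

    // Sketch: how a SubtargetFeature usually reaches a lowering decision.
    struct SubtargetSketch {
      bool FastFMAF32 = false; // set when "+fast-fmaf" is in the feature string
      bool hasFastFMAF32() const { return FastFMAF32; }
    };

    // A hook such as isFMAFasterThanFMulAndFAdd() can then prefer fma on f32:
    bool preferFMAForF32(const SubtargetSketch &ST) {
      return ST.hasFastFMAF32();
    }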
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 318de7f2e3d2..f5110857da84 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -116,8 +116,11 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset) const;
+
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
@@ -150,14 +153,12 @@ private:
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
- bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Omod) const;
bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp,
SDValue &Omod) const;
@@ -953,8 +954,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
return true;
}
+static bool isLegalMUBUFImmOffset(unsigned Imm) {
+ return isUInt<12>(Imm);
+}
+
static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
- return isUInt<12>(Imm->getZExtValue());
+ return isLegalMUBUFImmOffset(Imm->getZExtValue());
}
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
@@ -1076,9 +1081,9 @@ SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
return N;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &ImmOffset) const {
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
@@ -1087,8 +1092,22 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
- // (add n0, c1)
+ if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ unsigned Imm = CAddr->getZExtValue();
+ assert(!isLegalMUBUFImmOffset(Imm) &&
+ "should have been selected by other pattern");
+
+ SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ DL, MVT::i32, HighBits);
+ VAddr = SDValue(MovHighBits, 0);
+ ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ return true;
+ }
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ // (add n0, c1)
+
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
@@ -1107,6 +1126,24 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
+ SDValue &SRsrc,
+ SDValue &SOffset,
+ SDValue &Offset) const {
+ ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
+ if (!CAddr || !isLegalMUBUFImmOffset(CAddr))
+ return false;
+
+ SDLoc DL(Addr);
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
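
The two new selectors split scratch addressing around the 12-bit MUBUF immediate field: constant addresses that satisfy isLegalMUBUFImmOffset() take the OFFSET form via SelectMUBUFScratchOffset, while larger constants are decomposed in SelectMUBUFScratchOffen into (Imm & ~4095) materialized in a VGPR plus (Imm & 4095) as the immediate. A self-contained sketch of that decomposition:

    #include <cassert>
    #include <cstdint>

    // MUBUF instructions carry an unsigned 12-bit immediate offset.
    bool isLegalMUBUFImmOffsetSketch(uint32_t Imm) { return Imm < 4096; }

    // High bits go into a VGPR (via v_mov_b32); the low 12 bits stay immediate.
    struct SplitOffset { uint32_t VAddrBits; uint16_t ImmOffset; };

    SplitOffset splitScratchOffset(uint32_t Imm) {
      assert(!isLegalMUBUFImmOffsetSketch(Imm) && "small offsets use OFFSET form");
      return { Imm & ~4095u, static_cast<uint16_t>(Imm & 4095u) };
    }
    // e.g. Imm = 0x2345: VAddrBits = 0x2000, ImmOffset = 0x345,
    // and the hardware adds them back together: 0x2000 + 0x345 = 0x2345.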
@@ -1628,38 +1665,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- bool Res = SelectVOP3Mods(In, Src, SrcMods);
- return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
+ if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
+ return false;
+
+ Src = In;
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
SDLoc DL(In);
- // FIXME: Handle Clamp and Omod
- Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
-
- return SelectVOP3Mods(In, Src, SrcMods);
-}
-
-bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods, SDValue &Clamp,
- SDValue &Omod) const {
- bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
-
- return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
- cast<ConstantSDNode>(Clamp)->isNullValue() &&
- cast<ConstantSDNode>(Omod)->isNullValue();
-}
-
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Omod) const {
- // FIXME: Handle Omod
- Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
return SelectVOP3Mods(In, Src, SrcMods);
}
@@ -1677,9 +1696,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
Src = In;
SDLoc DL(In);
- // FIXME: Handle Clamp and Omod
- Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c0f336e082bd..e21775e61dd4 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2315,12 +2315,13 @@ static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
SelectionDAG &DAG = DCI.DAG;
SDValue Op = Node24->getOperand(OpIdx);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Op.getValueType();
APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
- if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+ if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
return true;
return false;
@@ -3361,7 +3362,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+ if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
TLI.SimplifyDemandedBits(BitsFrom, Demanded,
KnownZero, KnownOne, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
@@ -3436,6 +3437,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ELSE)
NODE_NAME_CASE(LOOP)
NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_FLAG)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d6aa0ba92bf7..13cbfe267932 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -231,6 +231,10 @@ public:
AMDGPUAS getAMDGPUAS() const {
return AMDGPUASI;
}
+
+ MVT getFenceOperandTy(const DataLayout &DL) const override {
+ return MVT::i32;
+ }
};
namespace AMDGPUISD {
@@ -244,6 +248,7 @@ enum NodeType : unsigned {
// Function call.
CALL,
+ TRAP,
// Masked control flow nodes.
IF,
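
The getFenceOperandTy() override above tells SelectionDAG to build a fence's ordering and scope operands as i32 constants; the ATOMIC_FENCE pseudo added later in this patch (SIInstructions.td) matches exactly those two i32 immediates. A sketch modeled on SelectionDAGBuilder::visitFence (simplified, not the exact in-tree code):

    SDValue buildFence(SelectionDAG &DAG, const TargetLowering &TLI,
                       SDValue Chain, unsigned Ordering, unsigned Scope,
                       const SDLoc &DL) {
      // getFenceOperandTy() picks the constant type -- MVT::i32 for AMDGPU.
      MVT OpTy = TLI.getFenceOperandTy(DAG.getDataLayout());
      SDValue Ops[] = {
        Chain,
        DAG.getTargetConstant(Ordering, DL, OpTy),
        DAG.getTargetConstant(Scope, DL, OpTy)
      };
      return DAG.getNode(ISD::ATOMIC_FENCE, DL, MVT::Other, Ops);
    }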
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 56f060984f08..c1706d12a2ea 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -78,6 +78,11 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
+ SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
+ [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
+>;
+
def AMDGPUconstdata_ptr : SDNode<
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
SDTCisVT<0, iPTR>]>
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index b8d681298dee..4e688ab0b105 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+def u16ImmTarget : AsmOperandClass {
+ let Name = "U16Imm";
+ let RenderMethod = "addImmOperands";
+}
+
+def s16ImmTarget : AsmOperandClass {
+ let Name = "S16Imm";
+ let RenderMethod = "addImmOperands";
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
@@ -58,6 +68,12 @@ def u32imm : Operand<i32> {
def u16imm : Operand<i16> {
let PrintMethod = "printU16ImmOperand";
+ let ParserMatchClass = u16ImmTarget;
+}
+
+def s16imm : Operand<i16> {
+ let PrintMethod = "printU16ImmOperand";
+ let ParserMatchClass = s16ImmTarget;
}
def u8imm : Operand<i8> {
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 14ee1c81f8fa..da247fea7de6 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -225,6 +225,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" divergent unreachable");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 961f7186f373..70c848f3c7bd 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -479,6 +479,8 @@ public:
bool isSMRDLiteralOffset() const;
bool isDPPCtrl() const;
bool isGPRIdxMode() const;
+ bool isS16Imm() const;
+ bool isU16Imm() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -2836,6 +2838,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
// s_waitcnt
//===----------------------------------------------------------------------===//
+static bool
+encodeCnt(
+ const AMDGPU::IsaInfo::IsaVersion ISA,
+ int64_t &IntVal,
+ int64_t CntVal,
+ bool Saturate,
+ unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned),
+ unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned))
+{
+ bool Failed = false;
+
+ IntVal = encode(ISA, IntVal, CntVal);
+ if (CntVal != decode(ISA, IntVal)) {
+ if (Saturate) {
+ IntVal = encode(ISA, IntVal, -1);
+ } else {
+ Failed = true;
+ }
+ }
+ return Failed;
+}
+
bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
StringRef CntName = Parser.getTok().getString();
int64_t CntVal;
@@ -2851,25 +2875,35 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
if (getParser().parseAbsoluteExpression(CntVal))
return true;
- if (getLexer().isNot(AsmToken::RParen))
- return true;
-
- Parser.Lex();
- if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
- Parser.Lex();
-
AMDGPU::IsaInfo::IsaVersion ISA =
AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
- if (CntName == "vmcnt")
- IntVal = encodeVmcnt(ISA, IntVal, CntVal);
- else if (CntName == "expcnt")
- IntVal = encodeExpcnt(ISA, IntVal, CntVal);
- else if (CntName == "lgkmcnt")
- IntVal = encodeLgkmcnt(ISA, IntVal, CntVal);
- else
- return true;
- return false;
+ bool Failed = true;
+ bool Sat = CntName.endswith("_sat");
+
+ if (CntName == "vmcnt" || CntName == "vmcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt);
+ } else if (CntName == "expcnt" || CntName == "expcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt);
+ } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt);
+ }
+
+ // To improve diagnostics, do not skip delimiters on errors
+ if (!Failed) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+ return true;
+ }
+ Parser.Lex();
+ if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) {
+ const AsmToken NextToken = getLexer().peekTok();
+ if (NextToken.is(AsmToken::Identifier)) {
+ Parser.Lex();
+ }
+ }
+ }
+
+ return Failed;
}
OperandMatchResultTy
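
encodeCnt() validates a count by round-tripping it through the encode/decode pair: if the value does not survive, it either fails or, for the new vmcnt_sat/expcnt_sat/lgkmcnt_sat spellings, clamps to the field maximum by encoding -1. A toy model with a 4-bit field (the real widths come from AMDGPU::IsaInfo and differ per ISA version):

    #include <cstdint>

    unsigned encodeToy(unsigned Wait, unsigned Cnt) {
      return (Wait & ~0xFu) | (Cnt & 0xFu);
    }
    unsigned decodeToy(unsigned Wait) { return Wait & 0xFu; }

    // Mirrors encodeCnt(): returns true on failure, clamps when saturating.
    bool encodeCntToy(int64_t &IntVal, int64_t CntVal, bool Saturate) {
      IntVal = encodeToy(IntVal, CntVal);
      if (CntVal != decodeToy(IntVal)) {
        if (!Saturate)
          return true;                   // count does not fit the field
        IntVal = encodeToy(IntVal, -1u); // all-ones == field maximum
      }
      return false;
    }
    // CntVal = 20 needs five bits: plain "vmcnt(20)" is rejected,
    // while "vmcnt_sat(20)" silently becomes vmcnt(15).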
@@ -3858,6 +3892,14 @@ bool AMDGPUOperand::isGPRIdxMode() const {
return isImm() && isUInt<4>(getImm());
}
+bool AMDGPUOperand::isS16Imm() const {
+ return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm()));
+}
+
+bool AMDGPUOperand::isU16Imm() const {
+ return isImm() && isUInt<16>(getImm());
+}
+
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
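
isS16Imm() above deliberately accepts any operand representable as either a signed or an unsigned 16-bit value, so -1 and 0xFFFF both name the same halfword encoding; isU16Imm() accepts only the unsigned range. Restated as a standalone sketch:

    #include <cstdint>

    bool fitsS16(int64_t V) { return V >= INT16_MIN && V <= INT16_MAX; }
    bool fitsU16(int64_t V) { return V >= 0 && V <= UINT16_MAX; }

    bool isS16ImmSketch(int64_t Imm) { return fitsS16(Imm) || fitsU16(Imm); }
    bool isU16ImmSketch(int64_t Imm) { return fitsU16(Imm); }
    // isS16ImmSketch(-1) and isS16ImmSketch(0xFFFF) both hold (same bits);
    // isU16ImmSketch(-1) does not.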
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index a6609f0725ab..89eddb9ce961 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>;
+
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
@@ -958,21 +960,30 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
} // End Predicates = [Has16BitInsts]
-class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
- (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
+multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag ld> {
+ def : Pat <
+ (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+
+ def : Pat <
+ (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+}
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt,
@@ -1054,19 +1065,29 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
-class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
- u16imm:$offset)),
- (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
+multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag st> {
+ def : Pat <
+ (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)),
+ (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+
+ def : Pat <
+ (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
+ u16imm:$offset)),
+ (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+}
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
//===----------------------------------------------------------------------===//
// MTBUF Patterns
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 4ecfa118fb27..bf16a8216001 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -83,8 +83,8 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg,
const auto RC = MRI.getRegClass(Reg);
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
- (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) :
- (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE);
+ (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
+ (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
}
void GCNRegPressure::inc(unsigned Reg,
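
This hunk is the first of many in this patch migrating from TargetRegisterClass::getSize(), which returned bytes, to TargetRegisterInfo::getRegSizeInBits(); every comparison and divisor is scaled by 8, so 4-byte checks become 32-bit checks and divisions by 4 become divisions by 32. The equivalence, as a compilable sketch:

    #include <cassert>

    unsigned numWordsFromBytes(unsigned SizeInBytes) { return SizeInBytes / 4; }
    unsigned numWordsFromBits(unsigned SizeInBits)   { return SizeInBits / 32; }

    int main() {
      // A 128-bit (16-byte) class such as SGPR_128 covers four 32-bit
      // registers under either query; only the unit of the answer changed.
      assert(numWordsFromBytes(16) == numWordsFromBits(128));
      return 0;
    }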
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
index 29a6ab9fbe93..647017d5061d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -286,20 +286,20 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
return ValueKind::Pipe;
return StringSwitch<ValueKind>(BaseTypeName)
+ .Case("image1d_t", ValueKind::Image)
+ .Case("image1d_array_t", ValueKind::Image)
+ .Case("image1d_buffer_t", ValueKind::Image)
+ .Case("image2d_t", ValueKind::Image)
+ .Case("image2d_array_t", ValueKind::Image)
+ .Case("image2d_array_depth_t", ValueKind::Image)
+ .Case("image2d_array_msaa_t", ValueKind::Image)
+ .Case("image2d_array_msaa_depth_t", ValueKind::Image)
+ .Case("image2d_depth_t", ValueKind::Image)
+ .Case("image2d_msaa_t", ValueKind::Image)
+ .Case("image2d_msaa_depth_t", ValueKind::Image)
+ .Case("image3d_t", ValueKind::Image)
.Case("sampler_t", ValueKind::Sampler)
.Case("queue_t", ValueKind::Queue)
- .Cases("image1d_t",
- "image1d_array_t",
- "image1d_buffer_t",
- "image2d_t" ,
- "image2d_array_t",
- "image2d_array_depth_t",
- "image2d_array_msaa_t"
- "image2d_array_msaa_depth_t"
- "image2d_depth_t",
- "image2d_msaa_t",
- "image2d_msaa_depth_t",
- "image3d_t", ValueKind::Image)
.Default(isa<PointerType>(Ty) ?
(Ty->getPointerAddressSpace() ==
AMDGPUASI.LOCAL_ADDRESS ?
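
The rewrite of this StringSwitch is not cosmetic: in the removed .Cases call, the commas after "image2d_array_msaa_t" and "image2d_array_msaa_depth_t" were missing, so adjacent string literals fused at compile time into one long key and several image types could never match. A minimal reproduction of the pitfall:

    #include <cassert>
    #include <cstring>

    int main() {
      // Without the comma, adjacent literals concatenate into a single
      // string, so neither intended key exists any more.
      const char *Fused = "image2d_array_msaa_t"       // missing comma (the bug)
                          "image2d_array_msaa_depth_t";
      assert(strcmp(Fused, "image2d_array_msaa_t") != 0);
      assert(strcmp(Fused, "image2d_array_msaa_depth_t") != 0);
      return 0;
    }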
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 6c61fb1f2d6b..2364e7b7b5fb 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -15,6 +15,7 @@ using namespace llvm;
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4;
+ StackGrowsUp = true;
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
MinInstAlignment = 4;
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f9d258f44a62..b0f0bf04a891 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -81,6 +81,11 @@ using namespace llvm;
#define DEBUG_TYPE "si-fix-sgpr-copies"
+static cl::opt<bool> EnableM0Merge(
+ "amdgpu-enable-merge-m0",
+ cl::desc("Merge and hoist M0 initializations"),
+ cl::init(false));
+
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
@@ -108,7 +113,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
return true;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- DenseSet<MachineBasicBlock*> Visited;
+template <class UnaryPredicate>
+bool searchPredecessors(const MachineBasicBlock *MBB,
+ const MachineBasicBlock *CutOff,
+ UnaryPredicate Predicate) {
+
+ if (MBB == CutOff)
+ return false;
+
+ DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
MBB->pred_end());
while (!Worklist.empty()) {
- MachineBasicBlock *mbb = Worklist.back();
- Worklist.pop_back();
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
- if (!Visited.insert(mbb).second)
+ if (!Visited.insert(MBB).second)
+ continue;
+ if (MBB == CutOff)
continue;
- if (hasTerminatorThatModifiesExec(*mbb, *TRI))
+ if (Predicate(MBB))
return true;
- Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end());
+ Worklist.append(MBB->pred_begin(), MBB->pred_end());
}
return false;
}
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
+// Checks if there is a potential path from instruction From to instruction
+// To. If CutOff is specified and sits on that path, the portion of the path
+// above it is ignored and the search reports "not reachable".
+static bool isReachable(const MachineInstr *From,
+ const MachineInstr *To,
+ const MachineBasicBlock *CutOff,
+ MachineDominatorTree &MDT) {
+ // If either From block dominates To block or instructions are in the same
+ // block and From is higher.
+ if (MDT.dominates(From, To))
+ return true;
+
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ if (MBBFrom == MBBTo)
+ return false;
+
+ // Instructions are in different blocks, do predecessor search.
+ // We should almost never get here since we do not usually produce M0 stores
+ // other than -1.
+ return searchPredecessors(MBBTo, CutOff, [MBBFrom]
+ (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
+}
+
+// Hoist and merge identical SGPR initializations into a common predecessor.
+// This is intended to combine M0 initializations, but can work with any
+// SGPR. A VGPR cannot be processed since we cannot guarantee vector
+// execution.
+static bool hoistAndMergeSGPRInits(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT) {
+ // List of inits by immediate value.
+ typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+ InitListMap Inits;
+ // List of clobbering instructions.
+ SmallVector<MachineInstr*, 8> Clobbers;
+ bool Changed = false;
+
+ for (auto &MI : MRI.def_instructions(Reg)) {
+ MachineOperand *Imm = nullptr;
+ for (auto &MO: MI.operands()) {
+ if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
+ (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
+ Imm = nullptr;
+ break;
+ } else if (MO.isImm())
+ Imm = &MO;
+ }
+ if (Imm)
+ Inits[Imm->getImm()].push_front(&MI);
+ else
+ Clobbers.push_back(&MI);
+ }
+
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+
+ for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
+ MachineInstr *MI1 = *I1;
+
+ for (auto I2 = std::next(I1); I2 != E; ) {
+ MachineInstr *MI2 = *I2;
+
+ // Check any possible interference
+ auto interferes = [&](MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To) -> bool {
+
+ assert(MDT.dominates(&*To, &*From));
+
+ auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
+ bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
+ if (!MayClobberFrom && !MayClobberTo)
+ return false;
+ if ((MayClobberFrom && !MayClobberTo) ||
+ (!MayClobberFrom && MayClobberTo))
+ return true;
+ // Both can clobber. This is not an interference only if both are
+ // dominated by Clobber and belong to the same block, or if Clobber
+ // properly dominates To; since To >> From, Clobber then dominates
+ // both and sits in a common dominator.
+ return !((MBBFrom == MBBTo &&
+ MDT.dominates(Clobber, &*From) &&
+ MDT.dominates(Clobber, &*To)) ||
+ MDT.properlyDominates(Clobber->getParent(), MBBTo));
+ };
+
+ return (any_of(Clobbers, interferes)) ||
+ (any_of(Inits, [&](InitListMap::value_type &C) {
+ return C.first != Init.first && any_of(C.second, interferes);
+ }));
+ };
+
+ if (MDT.dominates(MI1, MI2)) {
+ if (!interferes(MI2, MI1)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
+ << " " << *MI2);
+ MI2->eraseFromParent();
+ Defs.erase(I2++);
+ Changed = true;
+ continue;
+ }
+ } else if (MDT.dominates(MI2, MI1)) {
+ if (!interferes(MI1, MI2)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ } else {
+ auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
+ MI2->getParent());
+ if (!MBB) {
+ ++I2;
+ continue;
+ }
+
+ MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+ if (!interferes(MI1, I) && !interferes(MI2, I)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1 << "and moving from BB#"
+ << MI2->getParent()->getNumber() << " to BB#"
+ << I->getParent()->getNumber() << " " << *MI2);
+ I->getParent()->splice(I, MI2->getParent(), MI2);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ }
+ ++I2;
+ }
+ ++I1;
+ }
+ }
+
+ if (Changed)
+ MRI.clearKillFlags(Reg);
+
+ return Changed;
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+ hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+
return true;
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index abe6af9a6d3f..86e3b37b09e9 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -101,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- if (ScratchRsrcReg == AMDGPU::NoRegister)
+ if (ScratchRsrcReg == AMDGPU::NoRegister ||
+ !MRI.isPhysRegUsed(ScratchRsrcReg))
return AMDGPU::NoRegister;
if (ST.hasSGPRInitBug() ||
@@ -122,8 +124,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// We find the resource first because it has an alignment requirement.
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
@@ -143,24 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
-unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+// Shift down registers reserved for the scratch wave offset and stack pointer
+// SGPRs.
+std::pair<unsigned, unsigned>
+SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
- if (ST.hasSGPRInitBug() ||
- ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
- return ScratchWaveOffsetReg;
- unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ // No replacement necessary.
+ if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
+ !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
+ assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister);
+ return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
+ }
+
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ if (ST.hasSGPRInitBug())
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
+
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
if (NumPreloaded > AllSGPRs.size())
- return ScratchWaveOffsetReg;
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
AllSGPRs = AllSGPRs.slice(NumPreloaded);
@@ -175,26 +185,42 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// register from the list to consider, it means that when this
// register is being used for the scratch wave offset and there
// are no other free SGPRs, then the value will stay in this register.
+ // + 1 if stack pointer is used.
// ----
- // 13
- if (AllSGPRs.size() < 13)
- return ScratchWaveOffsetReg;
+ // 13 (+1)
+ unsigned ReservedRegCount = 13;
+ if (SPReg != AMDGPU::NoRegister)
+ ++ReservedRegCount;
- for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
+ if (AllSGPRs.size() < ReservedRegCount)
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
+
+ bool HandledScratchWaveOffsetReg =
+ ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) {
- if (!MRI.isAllocatable(Reg) ||
- TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
- continue;
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+ if (!HandledScratchWaveOffsetReg) {
+ HandledScratchWaveOffsetReg = true;
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- MFI->setScratchWaveOffsetReg(Reg);
- return Reg;
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ MFI->setScratchWaveOffsetReg(Reg);
+ ScratchWaveOffsetReg = Reg;
+ } else {
+ if (SPReg == AMDGPU::NoRegister)
+ break;
+
+ MRI.replaceRegWith(SPReg, Reg);
+ MFI->setStackPtrOffsetReg(Reg);
+ SPReg = Reg;
+ break;
+ }
}
}
- return ScratchWaveOffsetReg;
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
@@ -220,18 +246,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned ScratchRsrcReg
- = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
- unsigned ScratchWaveOffsetReg
- = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-
- if (ScratchRsrcReg == AMDGPU::NoRegister) {
- assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
- return;
- }
-
- assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
-
// We need to do the replacement of the private segment buffer and wave offset
// register even if there are no stack objects. There could be stores to undef
// or a constant without an associated object.
@@ -244,19 +258,49 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
emitFlatScratchInit(ST, MF, MBB);
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ if (SPReg != AMDGPU::NoRegister) {
+ DebugLoc DL;
+ int64_t StackSize = MF.getFrameInfo().getStackSize();
+
+ if (StackSize == 0) {
+ BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ } else {
+ BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(StackSize * ST.getWavefrontSize());
+ }
+ }
+
+ unsigned ScratchRsrcReg
+ = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+
+ unsigned ScratchWaveOffsetReg;
+ std::tie(ScratchWaveOffsetReg, SPReg)
+ = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+
+ // It's possible to have uses of only ScratchWaveOffsetReg without
+ // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
+ // but the inverse is not true.
+ if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
+ assert(ScratchRsrcReg == AMDGPU::NoRegister);
+ return;
+ }
+
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
- bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
- bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
+ bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
+ MRI.isPhysRegUsed(ScratchRsrcReg);
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
@@ -469,7 +513,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// this also ensures we shouldn't need a register for the offset when
// emergency scavenging.
int ScavengeFI = MFI.CreateFixedObject(
- AMDGPU::SGPR_32RegClass.getSize(), 0, false);
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
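
The new prologue code initializes the stack pointer SGPR from the scratch wave offset. Scratch is allocated per lane, so the per-lane stack size reported by MachineFrameInfo is scaled by the wavefront size to produce a wave-relative byte offset; a zero-sized stack degenerates to a plain copy. The arithmetic, worked through as a sketch:

    #include <cstdint>

    // Sketch of the SP initialization above (values illustrative).
    uint32_t initialSP(uint32_t ScratchWaveOffset, uint32_t StackSizePerLane,
                       uint32_t WavefrontSize /* 64 on GCN */) {
      if (StackSizePerLane == 0)
        return ScratchWaveOffset;                                   // the COPY case
      return ScratchWaveOffset + StackSizePerLane * WavefrontSize;  // S_ADD_U32
    }
    // A 16-byte per-lane frame on a 64-lane wave advances SP by 1024 bytes.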
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 1bfc08093da2..7ccd02b3c86a 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -49,7 +49,7 @@ private:
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+ std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index dd867b15b4c7..ce74a7cd8b04 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -287,8 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
- setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::TRAP, MVT::Other, Custom);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
@@ -1644,7 +1644,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
const TargetRegisterClass *SuperRC,
unsigned VecReg,
int Offset) {
- int NumElts = SuperRC->getSize() / 4;
+ int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
// Skip out of bounds offsets, or else we would end up using an undefined
// register.
@@ -1793,17 +1793,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return LoopBB;
}
-static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
- switch (VecRC->getSize()) {
- case 4:
+static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
+ const TargetRegisterClass *VecRC) {
+ switch (TRI.getRegSizeInBits(*VecRC)) {
+ case 32: // 4 bytes
return AMDGPU::V_MOVRELD_B32_V1;
- case 8:
+ case 64: // 8 bytes
return AMDGPU::V_MOVRELD_B32_V2;
- case 16:
+ case 128: // 16 bytes
return AMDGPU::V_MOVRELD_B32_V4;
- case 32:
+ case 256: // 32 bytes
return AMDGPU::V_MOVRELD_B32_V8;
- case 64:
+ case 512: // 64 bytes
return AMDGPU::V_MOVRELD_B32_V16;
default:
llvm_unreachable("unsupported size for MOVRELD pseudos");
@@ -1863,7 +1864,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
BuildMI(MBB, I, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
@@ -1907,7 +1908,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
} else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
@@ -1948,50 +1949,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
switch (MI.getOpcode()) {
- case AMDGPU::S_TRAP_PSEUDO: {
- const DebugLoc &DL = MI.getDebugLoc();
- const int TrapType = MI.getOperand(0).getImm();
-
- if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
- Subtarget->isTrapHandlerEnabled()) {
-
- MachineFunction *MF = BB->getParent();
- SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
-
- if (!BB->isLiveIn(UserSGPR))
- BB->addLiveIn(UserSGPR);
-
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1)
- .addReg(UserSGPR);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP))
- .addImm(TrapType)
- .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit);
- } else {
- switch (TrapType) {
- case SISubtarget::TrapIDLLVMTrap:
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM));
- break;
- case SISubtarget::TrapIDLLVMDebugTrap: {
- DiagnosticInfoUnsupported NoTrap(*MF->getFunction(),
- "debugtrap handler not supported",
- DL,
- DS_Warning);
- LLVMContext &C = MF->getFunction()->getContext();
- C.diagnose(NoTrap);
- BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP))
- .addImm(0);
- break;
- }
- default:
- llvm_unreachable("unsupported trap handler type!");
- }
- }
-
- MI.eraseFromParent();
- return BB;
- }
case AMDGPU::SI_INIT_M0:
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
@@ -2163,6 +2120,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
+
+ case ISD::TRAP:
+ case ISD::DEBUGTRAP:
+ return lowerTRAP(Op, DAG);
}
return SDValue();
}
@@ -2431,6 +2392,57 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
+SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+
+ unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
+ SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
+
+ if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
+ Subtarget->isTrapHandlerEnabled()) {
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
+
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(TrapID, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+ }
+
+ switch (TrapID) {
+ case SISubtarget::TrapIDLLVMTrap:
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+ case SISubtarget::TrapIDLLVMDebugTrap: {
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "debugtrap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.diagnose(NoTrap);
+ return Chain;
+ }
+ default:
+ llvm_unreachable("unsupported trap handler type!");
+ }
+
+ return Chain;
+}
+
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
// FIXME: Use inline constants (src_{shared, private}_base) instead.
@@ -3410,9 +3422,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+ if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+ return SDValue();
+
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
- if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- VT == MVT::f16) {
+ if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
if (CLHS->isExactlyValue(1.0)) {
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
@@ -4696,7 +4710,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+ if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c2a3e62aa827..9122cd72d323 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -428,8 +428,8 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
const MachineInstr &MIA = *MI;
const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
- unsigned Size = RC->getSize();
- Result.second = Result.first + (Size / 4);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Result.second = Result.first + (Size / 32);
return Result;
}
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 47257ce16ceb..9f32ecfa52ff 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -216,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
// XXX - What if this is a write into a super register?
const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
- unsigned Size = RC->getSize();
- Result.Named.LGKM = Size > 4 ? 2 : 1;
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Result.Named.LGKM = Size > 32 ? 2 : 1;
} else {
// s_dcache_inv etc. do not have a destination register. Assume we
// want a wait on these.
@@ -289,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
const MachineOperand &Reg) const {
- unsigned Size = RC->getSize();
- assert(Size >= 4);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ assert(Size >= 32);
RegInterval Result;
Result.first = TRI->getEncodingValue(Reg.getReg());
- Result.second = Result.first + Size / 4;
+ Result.second = Result.first + Size / 32;
return Result;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 05ac67d26620..92e452a3d6a0 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -138,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
}
if (isSMRD(Opc0) && isSMRD(Opc1)) {
+ // Skip time and cache invalidation instructions.
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
+ AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
+ return false;
+
assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
// Check base reg.
@@ -245,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
unsigned EltSize;
if (LdSt.mayLoad())
- EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
+ EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
else {
assert(LdSt.mayStore());
int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
- EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
+ EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
}
if (isStride64(Opc))
@@ -345,7 +350,7 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
FirstLdSt.getParent()->getParent()->getRegInfo();
const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
- return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
+ return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
@@ -433,7 +438,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
- if (RC->getSize() > 4) {
+ if (RI.getRegSizeInBits(*RC) > 32) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
@@ -493,11 +498,11 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
- if (DstRC->getSize() == 4) {
+ if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
- } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+ } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
- } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+ } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
return AMDGPU::V_MOV_B64_PSEUDO;
}
return AMDGPU::COPY;
@@ -557,17 +562,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
Size, Align);
+ unsigned SpillSize = TRI->getSpillSize(*RC);
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
- const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+ const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
// The SGPR spill/restore instructions only work on number sgprs, so we need
// to make sure we are using the correct register class.
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
@@ -602,7 +608,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+ unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
@@ -660,6 +666,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+ unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -670,8 +677,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
- const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
- if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+ const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
@@ -701,7 +708,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+ unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
@@ -1440,9 +1447,9 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
- unsigned DstSize = DstRC->getSize();
+ unsigned DstSize = RI.getRegSizeInBits(*DstRC);
- if (DstSize == 4) {
+ if (DstSize == 32) {
unsigned SelOp = Pred == SCC_TRUE ?
AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
@@ -1456,7 +1463,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
return;
}
- if (DstSize == 8 && Pred == SCC_TRUE) {
+ if (DstSize == 64 && Pred == SCC_TRUE) {
MachineInstr *Select =
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
.addReg(FalseReg)
@@ -1483,7 +1490,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
const int16_t *SubIndices = Sub0_15;
- int NElts = DstSize / 4;
+ int NElts = DstSize / 32;
// 64-bit select is only available for SALU.
if (Pred == SCC_TRUE) {
@@ -2635,6 +2642,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
return;
+ // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
+ // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
+ // select is uniform.
+ if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
+ RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ return;
+ }
+
// We do not use commuteInstruction here because it is too aggressive and will
// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste
@@ -2729,7 +2749,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
unsigned DstReg = MRI.createVirtualRegister(SRC);
- unsigned SubRegs = VRC->getSize() / 4;
+ unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
@@ -3595,7 +3615,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
.addImm(16)
.add(Src0);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
- .addImm(0xffff);
+ .addImm(0xffff0000);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
.add(Src1)
.addReg(ImmReg, RegState::Kill)
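
The mask change from 0xffff to 0xffff0000 fixes which half of Src1 survives the V_AND_OR_B32 (dst = (s0 & s1) | s2). Assuming the elided shift above is a 16-bit right shift of Src0 (consistent with s_pack_hh semantics: the high half of Src0 moves into the low half, and the high half of Src1 stays in place), the corrected combine works out as follows:

    #include <cassert>
    #include <cstdint>

    // Bit-level model of the VALU expansion, under the assumption above.
    uint32_t packHH(uint32_t Src0, uint32_t Src1) {
      uint32_t Tmp = Src0 >> 16;        // v_lshrrev_b32 16, src0
      uint32_t Imm = 0xffff0000u;       // was 0xffff, which kept the wrong half
      return (Src1 & Imm) | Tmp;        // v_and_or_b32 src1, imm, tmp
    }

    int main() {
      assert(packHH(0xAAAA1111u, 0xBBBB2222u) == 0xBBBBAAAAu);
      return 0;
    }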
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 659473ca6a47..03a5ef74b179 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -626,13 +626,13 @@ public:
return 4;
}
- return RI.getRegClass(OpInfo.RegClass)->getSize();
+ return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
}
/// \brief This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
- return getOpRegClass(MI, OpNo)->getSize();
+ return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
}
/// \returns true if it is legal for the operand at index \p OpNo
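getRegSizeInBits() reports bits while these accessors keep returning bytes, hence the division by 8. A minimal sketch of the preserved invariant (the helper name is hypothetical):

  // For example, VGPR_32 is 32 bits wide, so the old getSize() result of
  // 4 bytes becomes getRegSizeInBits(...) / 8 == 4.
  static unsigned regSizeInBytes(const SIRegisterInfo &RI,
                                 const TargetRegisterClass &RC) {
    return RI.getRegSizeInBits(RC) / 8;
  }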
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index c6daf743f3ac..7b052844f177 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -646,11 +646,10 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
-def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
-def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
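With the modifier output dropped, the ComplexPattern now produces a single result. A plausible sketch of the matching selector body (hedged; the in-tree definition may differ in detail) simply refuses sources that carry modifiers so the modifier-aware VOP3Mods patterns get a chance instead:

  bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
    // Reject sources wrapped in source modifiers; the pattern then fails to
    // match and selection falls back to the VOP3Mods patterns.
    if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
      return false;
    Src = In;
    return true;
  }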
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 2f89503e129a..3f6ddec70479 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
+def ATOMIC_FENCE : SPseudoInstSI<
+ (outs), (ins i32imm:$ordering, i32imm:$scope),
+ [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
+ "ATOMIC_FENCE $ordering, $scope"> {
+ let hasSideEffects = 1;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
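The two immediates carry the IR fence's atomic ordering and synchronization scope, which SelectionDAG materializes as i32 constants for the pattern above to match. A hedged illustration of the ordering encoding (the helper is hypothetical):

  #include "llvm/Support/AtomicOrdering.h"
  #include <cstdint>

  // For `fence seq_cst`, the first ATOMIC_FENCE immediate would be the enum
  // value of AtomicOrdering::SequentiallyConsistent; the second immediate is
  // a target-interpreted scope id.
  static int32_t encodeFenceOrdering(llvm::AtomicOrdering Ord) {
    return static_cast<int32_t>(Ord);
  }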
@@ -111,12 +117,6 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
-def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> {
- let hasSideEffects = 1;
- let SALU = 1;
- let usesCustomInserter = 1;
-}
-
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
@@ -400,13 +400,8 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
let Predicates = [isGCN] in {
def : Pat<
- (trap),
- (S_TRAP_PSEUDO TRAPID.LLVM_TRAP)
->;
-
-def : Pat<
- (debugtrap),
- (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP)
+ (AMDGPUtrap timm:$trapid),
+ (S_TRAP $trapid)
>;
def : Pat<
@@ -477,8 +472,8 @@ def : Pat <
// fp_to_fp16 patterns
def : Pat <
- (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
- (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
+ (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
@@ -507,11 +502,11 @@ def : Pat <
multiclass FMADPat <ValueType vt, Instruction inst> {
def : Pat <
- (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3NoMods vt:$src1, i32:$src1_modifiers),
- (VOP3NoMods vt:$src2, i32:$src2_modifiers))),
- (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- $src2_modifiers, $src2, $clamp, $omod)
+ (vt (fmad (VOP3NoMods vt:$src0),
+ (VOP3NoMods vt:$src1),
+ (VOP3NoMods vt:$src2))),
+ (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
}
@@ -681,10 +676,9 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
class ClampPat<Instruction inst, ValueType vt> : Pat <
- (vt (AMDGPUclamp
- (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))),
+ (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
(inst i32:$src0_modifiers, vt:$src0,
- i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
+ i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
>;
def : ClampPat<V_MAX_F32_e64, f32>;
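Scalar semantics of what the pattern emits (an illustrative model, ignoring the NaN and denormal subtleties discussed above): V_MAX_F32_e64 with both sources tied to $src0 plus DSTCLAMP.ENABLE clamps the result to [0, 1].

  #include <algorithm>

  static float clamp01(float X) {
    float R = std::max(X, X);                  // V_MAX_F32_e64 $src0, $src0
    return std::min(std::max(R, 0.0f), 1.0f);  // DSTCLAMP.ENABLE
  }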
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8e612d2ddfda..b6a982aee6be 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -25,6 +25,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
TIDReg(AMDGPU::NoRegister),
ScratchRSrcReg(AMDGPU::NoRegister),
ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ FrameOffsetReg(AMDGPU::NoRegister),
+ StackPtrOffsetReg(AMDGPU::NoRegister),
PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
DispatchPtrUserSGPR(AMDGPU::NoRegister),
QueuePtrUserSGPR(AMDGPU::NoRegister),
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 810fb05984c4..dc9f509e60ae 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -88,6 +88,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;
+ // Offset of the current function's frame, measured from the kernel's
+ // scratch wave offset register. For an entry function, this is exactly the
+ // same as the ScratchWaveOffsetReg.
+ unsigned FrameOffsetReg;
+
+ // SGPR holding the top-of-stack offset, derived from the ScratchWaveOffsetReg.
+ unsigned StackPtrOffsetReg;
+
// Input registers for non-HSA ABI
unsigned PrivateMemoryPtrUserSGPR;
@@ -364,9 +372,25 @@ public:
return ScratchWaveOffsetReg;
}
+ unsigned getFrameOffsetReg() const {
+ return FrameOffsetReg;
+ }
+
+ void setStackPtrOffsetReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ StackPtrOffsetReg = Reg;
+ }
+
+ unsigned getStackPtrOffsetReg() const {
+ return StackPtrOffsetReg;
+ }
+
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
+
+ // FIXME: Only for entry functions.
+ FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
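A hedged usage sketch of the two new accessors (the call site is assumed, not part of this patch): frame lowering would address stack objects relative to FrameOffsetReg and grow the stack through StackPtrOffsetReg.

  #include "SIMachineFunctionInfo.h"

  static void describeStackRegs(const llvm::SIMachineFunctionInfo &MFI) {
    unsigned FP = MFI.getFrameOffsetReg();    // equals ScratchWaveOffsetReg in
                                              // entry functions (see the FIXME)
    unsigned SP = MFI.getStackPtrOffsetReg(); // top-of-stack offset SGPR
    (void)FP;
    (void)SP;
  }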
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 098c67252dd8..8820e294562b 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
+ // M0 has to be reserved so that LLVM accepts it as a live-in to a block.
+ reserveRegisterTuples(Reserved, AMDGPU::M0);
+
// Reserve the memory aperture registers.
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
@@ -615,7 +618,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && isSGPRClass(RC)) {
  // XXX - if private_element_size is larger than 4 it might be useful to be
  // able to emit wider vmem spills.
- std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+ std::tie(EltSize, ScalarStoreOp) =
+ getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
}
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
@@ -775,7 +779,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && isSGPRClass(RC)) {
  // XXX - if private_element_size is larger than 4 it might be useful to be
  // able to emit wider vmem spills.
- std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+ std::tie(EltSize, ScalarLoadOp) =
+ getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
}
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
@@ -1038,20 +1043,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  // TODO: It might be helpful to have some target-specific flags in
  // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- switch (RC->getSize()) {
- case 0: return false;
- case 1: return false;
- case 4:
+ unsigned Size = getRegSizeInBits(*RC);
+ if (Size < 32)
+ return false;
+ switch (Size) {
+ case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
- case 8:
+ case 64:
return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
- case 12:
+ case 96:
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
- case 16:
+ case 128:
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
- case 32:
+ case 256:
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
- case 64:
+ case 512:
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
default:
llvm_unreachable("Invalid register class size");
@@ -1060,18 +1066,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const {
- switch (SRC->getSize()) {
- case 4:
+ switch (getRegSizeInBits(*SRC)) {
+ case 32:
return &AMDGPU::VGPR_32RegClass;
- case 8:
+ case 64:
return &AMDGPU::VReg_64RegClass;
- case 12:
+ case 96:
return &AMDGPU::VReg_96RegClass;
- case 16:
+ case 128:
return &AMDGPU::VReg_128RegClass;
- case 32:
+ case 256:
return &AMDGPU::VReg_256RegClass;
- case 64:
+ case 512:
return &AMDGPU::VReg_512RegClass;
default:
llvm_unreachable("Invalid register class size");
@@ -1080,16 +1086,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
const TargetRegisterClass *VRC) const {
- switch (VRC->getSize()) {
- case 4:
+ switch (getRegSizeInBits(*VRC)) {
+ case 32:
return &AMDGPU::SGPR_32RegClass;
- case 8:
+ case 64:
return &AMDGPU::SReg_64RegClass;
- case 16:
+ case 128:
return &AMDGPU::SReg_128RegClass;
- case 32:
+ case 256:
return &AMDGPU::SReg_256RegClass;
- case 64:
+ case 512:
return &AMDGPU::SReg_512RegClass;
default:
llvm_unreachable("Invalid register class size");
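A quick sanity sketch of the bit-based mapping established by the two switches above (class sizes taken from the cases themselves):

  #include "SIRegisterInfo.h"
  #include <cassert>

  static void checkEquivalentClasses(const SIRegisterInfo &TRI) {
    const TargetRegisterClass *VRC = &AMDGPU::VReg_64RegClass;
    assert(TRI.getRegSizeInBits(*VRC) == 64);
    assert(TRI.getEquivalentSGPRClass(VRC) == &AMDGPU::SReg_64RegClass);
    assert(TRI.getEquivalentVGPRClass(&AMDGPU::SReg_64RegClass) == VRC);
  }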
@@ -1354,15 +1360,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *DstRC,
unsigned DstSubReg,
const TargetRegisterClass *NewRC) const {
- unsigned SrcSize = SrcRC->getSize();
- unsigned DstSize = DstRC->getSize();
- unsigned NewSize = NewRC->getSize();
+ unsigned SrcSize = getRegSizeInBits(*SrcRC);
+ unsigned DstSize = getRegSizeInBits(*DstRC);
+ unsigned NewSize = getRegSizeInBits(*NewRC);
  // Do not increase the size of registers beyond a dword; we would need to
  // allocate adjacent registers and constrain regalloc more than needed.
// Always allow dword coalescing.
- if (SrcSize <= 4 || DstSize <= 4)
+ if (SrcSize <= 32 || DstSize <= 32)
return true;
return NewSize <= DstSize || NewSize <= SrcSize;
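Restated as a standalone predicate (the same logic as above, with all sizes now in bits): coalescing that touches a 32-bit class is always allowed; otherwise the merged class must be no wider than one of the inputs.

  static bool coalesceAllowed(unsigned SrcBits, unsigned DstBits,
                              unsigned NewBits) {
    if (SrcBits <= 32 || DstBits <= 32)
      return true; // always allow dword coalescing
    return NewBits <= DstBits || NewBits <= SrcBits;
  }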
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index b4adbdd1df07..593439c2a3cd 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -530,14 +530,16 @@ class SOPKInstTable <bit is_sopk, string cmpOp = ""> {
class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
opName,
(outs SReg_32:$sdst),
- (ins u16imm:$simm16),
+ (ins s16imm:$simm16),
"$sdst, $simm16",
pattern>;
-class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
+class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo <
opName,
(outs),
- (ins SReg_32:$sdst, u16imm:$simm16),
+ !if(isSignExt,
+ (ins SReg_32:$sdst, s16imm:$simm16),
+ (ins SReg_32:$sdst, u16imm:$simm16)),
"$sdst, $simm16", []>,
SOPKInstTable<1, base_op>{
let Defs = [SCC];
@@ -546,7 +548,7 @@ class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
opName,
(outs SReg_32:$sdst),
- (ins SReg_32:$src0, u16imm:$simm16),
+ (ins SReg_32:$src0, s16imm:$simm16),
"$sdst, $simm16",
pattern
>;
@@ -575,20 +577,20 @@ let isCompare = 1 in {
// [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
// >;
-def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">;
-def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">;
-def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">;
-def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">;
-def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">;
-def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">;
+def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>;
+def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>;
+def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>;
+def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>;
+def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>;
+def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>;
let SOPKZext = 1 in {
-def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">;
-def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">;
-def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">;
-def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">;
-def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">;
-def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">;
+def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>;
+def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>;
+def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>;
+def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>;
+def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>;
+def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>;
} // End SOPKZext = 1
} // End isCompare = 1
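The operand-type split mirrors how the 16-bit SOPK immediate is extended: sign-extended for the signed compares, zero-extended (SOPKZext) for the unsigned ones, so the same bit pattern reads differently (illustrative):

  #include <cstdint>

  static const int32_t  SImm = static_cast<int16_t>(0xffffu);  // -1 for s_cmpk_lt_i32
  static const uint32_t UImm = static_cast<uint16_t>(0xffffu); // 65535 for s_cmpk_lt_u32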
@@ -600,7 +602,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
def S_CBRANCH_I_FORK : SOPK_Pseudo <
"s_cbranch_i_fork",
- (outs), (ins SReg_64:$sdst, u16imm:$simm16),
+ (outs), (ins SReg_64:$sdst, s16imm:$simm16),
"$sdst, $simm16"
>;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 86095a8e1142..5a3242bed1d0 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -93,6 +93,12 @@ unsigned getVmcntBitWidthHi() { return 2; }
} // end namespace anonymous
namespace llvm {
+
+static cl::opt<bool> EnablePackedInlinableLiterals(
+ "enable-packed-inlinable-literals",
+ cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
+ cl::init(false));
+
namespace AMDGPU {
namespace IsaInfo {
@@ -703,6 +709,9 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
+ if (!EnablePackedInlinableLiterals)
+ return false;
+
int16_t Lo16 = static_cast<int16_t>(Literal);
int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
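A worked instance of the check above: 0x3C003C00 is <1.0, 1.0> as v2f16, both halves are equal, and 0x3C00 is the inline constant 1.0 in half precision, so once the new flag is enabled the literal can be encoded inline.

  #include <cstdint>

  static bool packedOnesIsInlinable() {
    int32_t Lit = 0x3C003C00;               // <1.0, 1.0> as v2f16
    int16_t Lo = static_cast<int16_t>(Lit);
    int16_t Hi = static_cast<int16_t>(Lit >> 16);
    return Lo == Hi;                        // plus isInlinableLiteral16(Lo, true)
  }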