Diffstat (limited to 'lib/Target/AMDGPU')
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp         |  17
 lib/Target/AMDGPU/AMDGPUInstrInfo.td             |   5
 lib/Target/AMDGPU/AMDGPUInstructions.td          |   1
 lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp |  26
 lib/Target/AMDGPU/MIMGInstructions.td            |  26
 lib/Target/AMDGPU/SIISelLowering.cpp             | 147
 lib/Target/AMDGPU/SIISelLowering.h               |  13
 lib/Target/AMDGPU/SIInsertSkips.cpp              |  22
 lib/Target/AMDGPU/SIInstrInfo.cpp                |  30
 lib/Target/AMDGPU/SIInstrInfo.h                  |   3
 lib/Target/AMDGPU/SIInstructions.td              |  10
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp       |   1
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h         |   9
 lib/Target/AMDGPU/VOP3PInstructions.td           |  31
 14 files changed, 246 insertions(+), 95 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b201126c593b..21e44e9589d3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -554,6 +554,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT:
+ case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RCP_IFLAG:
@@ -907,6 +908,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
LLVMContext &Ctx = Fn.getParent()->getContext();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+ CallingConv::ID CC = Fn.getCallingConv();
unsigned MaxAlign = 1;
uint64_t ExplicitArgOffset = 0;
@@ -940,16 +942,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
EVT ArgVT = ValueVTs[Value];
EVT MemVT = ArgVT;
- MVT RegisterVT =
- getRegisterTypeForCallingConv(Ctx, ArgVT);
- unsigned NumRegs =
- getNumRegistersForCallingConv(Ctx, ArgVT);
-
- if (!Subtarget->isAmdHsaOS() &&
- (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
- // The ABI says the caller will extend these values to 32-bits.
- MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
- } else if (NumRegs == 1) {
+ MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
+ unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
+
+ if (NumRegs == 1) {
// This argument is not split, so the IR type is the memory type.
if (ArgVT.isExtended()) {
// We have an extended type, like i24, so we should just use the
@@ -3600,6 +3596,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FRINT:
case ISD::FNEARBYINT: // XXX - Should fround be handled?
case ISD::FSIN:
+ case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RCP_IFLAG:
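Whitelisting ISD::FCANONICALIZE in both fnegFoldsIntoOp and performFNegCombine lets the DAG combiner push an fneg below the canonicalize, where the negation is free as a VOP3 source modifier. A minimal sketch of the fold this enables (LLVM SelectionDAG API; not the in-tree combine, which handles all whitelisted opcodes generically):

    // (fneg (fcanonicalize x)) -> (fcanonicalize (fneg x))
    // The inner fneg can then be folded into the source modifier of
    // whatever instruction the canonicalize selects to.
    static SDValue foldFNegIntoCanonicalize(SelectionDAG &DAG, SDNode *N) {
      SDValue N0 = N->getOperand(0); // operand of the fneg
      if (N0.getOpcode() != ISD::FCANONICALIZE || !N0.hasOneUse())
        return SDValue();
      SDLoc SL(N);
      EVT VT = N->getValueType(0);
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, N0.getOperand(0));
      return DAG.getNode(ISD::FCANONICALIZE, SL, VT, Neg);
    }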
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 96b7568eec1f..7442a59e594f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
- SDTCisFP<0>, SDTCisVec<1>]>,
+ SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+ SDTCisFP<0>, SDTCisVec<1>,
+ SDTCisInt<4>]>,
[]>;
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 9426df399597..c9c932ef2f5f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -567,6 +567,7 @@ int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
+int FP16_NEG_ONE = 0xBC00;
int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
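FP16_NEG_ONE is the IEEE-754 binary16 encoding of -1.0, added for the fcanonicalize(fneg x) pattern in SIInstructions.td later in this patch. The encoding checks out (standalone C++, just to document the constant):

    // binary16 layout: sign(1) | exponent(5, bias 15) | mantissa(10)
    // -1.0 = (-1)^1 * 2^(15-15) * 1.0  ->  sign=1, exp=0b01111, mant=0
    static_assert((1u << 15 | 0b01111u << 10) == 0xBC00, "FP16_NEG_ONE");
    static_assert((0b01111u << 10) == 0x3C00, "FP16_ONE");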
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 8cc7e38f7b29..c147830e12ed 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -100,16 +100,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
-
- // Clover seems to always pad i8/i16 to i32, but doesn't properly align
- // them?
- // Make sure the struct elements have correct size and alignment for ext
- // args. These seem to be padded up to 4-bytes but not correctly aligned.
- bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
- !ST.isAmdHsaOS();
- if (IsExtArg)
- AllocSize = 4;
-
uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
@@ -164,8 +154,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
ArgPtr->getName() + ".cast");
}
- assert((!IsExtArg || !IsV3) && "incompatible situation");
-
if (IsV3 && Size >= 32) {
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
@@ -212,20 +200,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
// TODO: Convert noalias arg to !noalias
if (Size < 32 && !ArgTy->isAggregateType()) {
- if (IsExtArg && OffsetDiff == 0) {
- Type *I32Ty = Builder.getInt32Ty();
- bool IsSext = Arg.hasSExtAttr();
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(
- ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
- ConstantAsMetadata::get(
- ConstantInt::get(I32Ty,
- IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
- };
-
- Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
- }
-
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
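With the Clover special case removed, sub-dword kernel arguments uniformly take the generic path: load the containing 32-bit dword, shift the wanted bytes down, and truncate. Roughly, for an i16 argument at byte offset 2 within its dword (IR shown as comments; offsets illustrative, not from this patch):

    // %dword = load i32, i32 addrspace(4)* %kernarg.ptr, align 4
    // %shr   = lshr i32 %dword, 16      ; OffsetDiff * 8
    // %arg   = trunc i32 %shr to i16
    // The deleted code additionally padded such args to 4 bytes and attached
    // !range metadata for sext/zext args on non-HSA (Clover) targets.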
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 1e0bc62c45a6..44c2d366e461 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -66,6 +66,22 @@ def MIMGDimInfoTable : GenericTable {
let PrimaryKeyName = "getMIMGDimInfo";
}
+class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
+ MIMGBaseOpcode L = l;
+ MIMGBaseOpcode LZ = lz;
+}
+
+def MIMGLZMappingTable : GenericTable {
+ let FilterClass = "MIMGLZMapping";
+ let CppTypeName = "MIMGLZMappingInfo";
+ let Fields = ["L", "LZ"];
+  GenericEnum TypeOf_L = MIMGBaseOpcode;
+  GenericEnum TypeOf_LZ = MIMGBaseOpcode;
+
+ let PrimaryKey = ["L"];
+ let PrimaryKeyName = "getMIMGLZMappingInfo";
+}
+
class mimg <bits<7> si, bits<7> vi = si> {
field bits<7> SI = si;
field bits<7> VI = vi;
@@ -547,3 +563,13 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
AMDGPUImageDimAtomicIntrinsics) in {
def : ImageDimIntrinsicInfo<intr>;
}
+
+// L to LZ Optimization Mapping
+def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>;
+def : MIMGLZMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_LZ_O>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_LZ_O>;
+def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>;
+def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>;
+def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>;
+def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc2656a20..25007861fd15 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,87 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
return false;
}
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // TODO: Consider splitting all arguments into 32-bit pieces.
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32)
+ return ScalarVT.getSimpleVT();
+
+ if (Size == 64)
+ return MVT::i32;
+
+ if (Size == 16 &&
+ Subtarget->has16BitInsts() &&
+ isPowerOf2_32(VT.getVectorNumElements()))
+ return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ }
+
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+
+ if (Size == 32)
+ return NumElts;
+
+ if (Size == 64)
+ return 2 * NumElts;
+
+ // FIXME: Fails to break down as we want with v3.
+ if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
+ return VT.getVectorNumElements() / 2;
+ }
+
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC,
+ EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32) {
+ RegisterVT = ScalarVT.getSimpleVT();
+ IntermediateVT = RegisterVT;
+ NumIntermediates = NumElts;
+ return NumIntermediates;
+ }
+
+ if (Size == 64) {
+ RegisterVT = MVT::i32;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = 2 * NumElts;
+ return NumIntermediates;
+ }
+
+ // FIXME: We should fix the ABI to be the same on targets without 16-bit
+  // support, but unless we can properly handle 3-vectors, it will still be
+  // inconsistent.
+ if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+ RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = NumElts / 2;
+ return NumIntermediates;
+ }
+ }
+
+ return TargetLowering::getVectorTypeBreakdownForCallingConv(
+ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
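Worked examples of what the three overrides above produce for a non-AMDGPU_KERNEL calling convention (assuming Subtarget->has16BitInsts()):

    // v4f32 -> RegisterVT = f32,   NumRegs = 4  (one register per element)
    // v2f64 -> RegisterVT = i32,   NumRegs = 4  (64-bit elements split in two)
    // v4f16 -> RegisterVT = v2f16, NumRegs = 2  (16-bit elements packed in pairs)
    // v3f16 -> generic TargetLowering breakdown (not a power-of-two element
    //          count; see the FIXMEs above)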
@@ -1268,6 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
+ assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
!Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
@@ -1301,25 +1384,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
++PSInputNum;
}
- // Second split vertices into their elements.
- if (Arg->VT.isVector()) {
- ISD::InputArg NewArg = *Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg->VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned J = 0; J != NumElements; ++J) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
- }
- } else {
- Splits.push_back(*Arg);
- }
+ Splits.push_back(*Arg);
}
}
@@ -4490,6 +4555,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+ const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ unsigned IntrOpcode = Intr->BaseOpcode;
SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
@@ -4575,6 +4643,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 4> VAddrs;
for (unsigned i = 0; i < NumVAddrs; ++i)
VAddrs.push_back(Op.getOperand(AddrIdx + i));
+
+ // Optimize _L to _LZ when _L is zero
+ if (LZMappingInfo) {
+ if (auto ConstantLod =
+ dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+ VAddrs.pop_back(); // remove 'lod'
+ }
+ }
+ }
+
SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4634,10 +4714,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
int Opcode = -1;
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
- Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
NumVDataDwords, NumVAddrDwords);
assert(Opcode != -1);
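This is the _L to _LZ optimization the new table enables: when the lod operand is a compile-time constant that is zero, or negative (which presumably clamps to zero in hardware, hence the isNegative() check above), the *_L opcode is swapped for its *_LZ variant and the lod component is dropped from the address, saving a VAddr dword. Schematically:

    // image_sample_l  v[0:3], [s, t, 0.0], ...   ; lod occupies a VAddr dword
    //   becomes
    // image_sample_lz v[0:3], [s, t], ...        ; lod implicitly zero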
@@ -4945,7 +5025,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fdot2:
return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -6754,10 +6835,6 @@ static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
ST->hasFP16Denormals();
- case ISD::FP16_TO_FP:
- case ISD::FP_TO_FP16:
- return ST->hasFP16Denormals();
-
// It can/will be lowered or combined as a bit operation.
// Need to check their input recursively to handle.
case ISD::FNEG:
@@ -6799,8 +6876,16 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
+ SDValue N0 = N->getOperand(0);
+ // fcanonicalize undef -> qnan
+ if (N0.isUndef()) {
+ EVT VT = N->getValueType(0);
+ APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
+ return DAG.getConstantFP(QNaN, SDLoc(N), VT);
+ }
+
+ ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
if (!CFP) {
SDValue N0 = N->getOperand(0);
EVT VT = N0.getValueType().getScalarType();
@@ -6853,7 +6938,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return N->getOperand(0);
+ return N0;
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
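Folding fcanonicalize(undef) to a quiet NaN is sound because the combine may pick any canonical value for undef, and a default qNaN is canonical in every denormal mode. For reference, the APFloat::getQNaN bit patterns involved (exponent all-ones, quiet bit set, zero payload):

    static_assert(0x7E00 == (0x1Fu << 10 | 1u << 9), "f16 default qNaN");
    static_assert(0x7FC00000u == (0xFFu << 23 | 1u << 22), "f32 default qNaN");
    // f64: 0x7FF8000000000000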
@@ -7544,8 +7629,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
if ((Vec1 == Vec3 && Vec2 == Vec4) ||
- (Vec1 == Vec4 && Vec2 == Vec3))
- return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ (Vec1 == Vec4 && Vec2 == Vec3)) {
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
+ DAG.getTargetConstant(0, SL, MVT::i1));
+ }
}
return SDValue();
}
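performFMACombine recognizes a two-deep FMA chain over the extracted halves of two v2f16 vectors and now emits FDOT2 with an explicit clamp operand, fixed at 0 here; the extra operand matches the widened SDTypeProfile in AMDGPUInstrInfo.td. In scalar source form the matched shape is roughly (names illustrative, not from the patch):

    // float t = fmaf((float)a.y, (float)b.y, acc);
    // float r = fmaf((float)a.x, (float)b.x, t);
    //   ==> r = v_dot2_f32_f16(a, b, acc)   // clamp = 0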
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index ad049f2a71c3..5b3d49b3d8e3 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -25,6 +25,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
private:
const GCNSubtarget *Subtarget;
+public:
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
+
+ unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+private:
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index 61c8f359e168..dc9397cf7b85 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -133,28 +133,10 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
return true;
- // V_READFIRSTLANE/V_READLANE destination register may be used as operand
- // by some SALU instruction. If exec mask is zero vector instruction
- // defining the register that is used by the scalar one is not executed
- // and scalar instruction will operate on undefined data. For
- // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
- if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
- (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+ if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;
- }
-
- if (I->isInlineAsm()) {
- const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const char *AsmStr = I->getOperand(0).getSymbolName();
-
- // inlineasm length estimate is number of bytes assuming the longest
- // instruction.
- uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
- NumInstr += MaxAsmSize / MAI->getMaxInstLength();
- } else {
- ++NumInstr;
- }
+ ++NumInstr;
if (NumInstr >= SkipThreshold)
return true;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6c85c92454c3..f3745382a6f4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2332,6 +2332,36 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
changesVGPRIndexingMode(MI);
}
+bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+
+ if (MI.mayStore() && isSMRD(MI))
+ return true; // scalar store or atomic
+
+ // These instructions cause shader I/O that may cause hardware lockups
+ // when executed with an empty EXEC mask.
+ //
+ // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
+ // EXEC = 0, but checking for that case here seems not worth it
+ // given the typical code patterns.
+ if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
+ Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+ return true;
+
+ if (MI.isInlineAsm())
+ return true; // conservative assumption
+
+ // These are like SALU instructions in terms of effects, so it's questionable
+ // whether we should return true for those.
+ //
+ // However, executing them with EXEC = 0 causes them to operate on undefined
+ // data, which we avoid by returning true here.
+ if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
+ return true;
+
+ return false;
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
switch (Imm.getBitWidth()) {
case 32:
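A sketch of how a pass consumes the new hook; SIInsertSkips::shouldSkip above is the in-tree caller:

    // Must we branch around this block rather than let it run with EXEC = 0?
    static bool mustSkipWhenExecEmpty(const MachineBasicBlock &MBB,
                                      const SIInstrInfo &TII) {
      for (const MachineInstr &MI : MBB)
        if (TII.hasUnwantedEffectsWhenEXECEmpty(MI))
          return true; // exp/s_sendmsg lockups, inline asm, v_readfirstlane
      return false;
    }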
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0a735257d34e..d681b926504e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -597,6 +597,9 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
+ /// Whether we must prevent this instruction from executing with EXEC = 0.
+ bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index c3f8bfb53ef4..5c10646161b3 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1387,6 +1387,11 @@ def : GCNPat<
>;
def : GCNPat<
+ (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+>;
+
+def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
(V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
@@ -1411,6 +1416,11 @@ def : GCNPat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
>;
+
+def : GCNPat<
+ (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+>;
}
let OtherPredicates = [FP32Denormals] in {
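The new patterns fold the fneg into the canonicalizing multiply: multiplying by -1.0 both canonicalizes (quiets signaling NaNs, flushes denormals according to the current mode) and negates, so no separate sign-flip instruction is needed. Schematically:

    // fcanonicalize(fneg x) : f32 -> v_mul_f32_e64 0xBF800000, x  ; x * -1.0
    // fcanonicalize(fneg x) : f16 -> v_mul_f16_e64 0xBC00, x      ; x * -1.0h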
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3fd3c75874a3..4eba19382315 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -110,6 +110,7 @@ struct MIMGInfo {
#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
+#define GET_MIMGLZMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 70681c271697..5b7af8268cda 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -42,6 +42,7 @@ namespace AMDGPU {
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
+#define GET_MIMGLZMapping_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -211,6 +212,14 @@ struct MIMGDimInfo {
LLVM_READONLY
const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
+struct MIMGLZMappingInfo {
+ MIMGBaseOpcode L;
+ MIMGBaseOpcode LZ;
+};
+
+LLVM_READONLY
+const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
+
LLVM_READONLY
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
unsigned VDataDwords, unsigned VAddrDwords);
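Typical use of the new lookup, mirroring SITargetLowering::lowerImage above (minimal sketch; assumes the TableGen-generated MIMGBaseOpcode enumerators are in scope):

    // Map an _L base opcode to its _LZ variant when the lod is known zero;
    // getMIMGLZMappingInfo returns null for opcodes without a mapping.
    static unsigned foldLToLZ(unsigned BaseOpcode) {
      if (const AMDGPU::MIMGLZMappingInfo *M =
              AMDGPU::getMIMGLZMappingInfo(BaseOpcode))
        return M->LZ;
      return BaseOpcode;
    }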
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 5c78ada3211e..b51828b54679 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
let SubtargetPredicate = HasDLInsts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
-def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
-def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+multiclass DotPats<SDPatternOperator dot_op,
+ VOP3PInst dot_inst> {
+ def : GCNPat <
+ (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
+ (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
+ (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
+ (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
+}
+
+defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
+defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
+defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
+defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
+defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
+defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
+defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
} // End SubtargetPredicate = HasDLInsts
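Dropping the pattern argument from the VOP3PInst definitions and matching through DotPats instead is what makes room for the new clamp operand: the multiclass threads $clamp through as_i1imm into the instruction's clamp bit. At the IR level the dot intrinsics now take a trailing i1 (sketch, shown for fdot2):

    // declare float @llvm.amdgcn.fdot2(<2 x half>, <2 x half>, float, i1)
    // %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b,
    //                                    float %acc, i1 false)  ; no clamp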