Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp  147
1 files changed, 117 insertions, 30 deletions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc2656a20..25007861fd15 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,87 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
return false;
}
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // TODO: Consider splitting all arguments into 32-bit pieces.
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32)
+ return ScalarVT.getSimpleVT();
+
+ if (Size == 64)
+ return MVT::i32;
+
+ if (Size == 16 &&
+ Subtarget->has16BitInsts() &&
+ isPowerOf2_32(VT.getVectorNumElements()))
+ return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ }
+
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
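The mapping this hook establishes, shown as a minimal sketch. The names TLI (a SITargetLowering instance) and Ctx (an LLVMContext), and the choice of CallingConv::AMDGPU_VS, are illustrative assumptions rather than part of the patch; the last case also assumes a subtarget with 16-bit instructions.

  // 32-bit elements pass through as scalars, 64-bit elements are split into
  // i32 pieces, and f16 elements stay packed in pairs on subtargets with
  // 16-bit instructions.
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v4f32) == MVT::f32);
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v2f64) == MVT::i32);
  assert(TLI.getRegisterTypeForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v4f16) == MVT::v2f16);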
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+
+ if (Size == 32)
+ return NumElts;
+
+ if (Size == 64)
+ return 2 * NumElts;
+
+ // FIXME: Fails to break down as we want with v3.
+ if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
+ return VT.getVectorNumElements() / 2;
+ }
+
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
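The matching register counts, under the same illustrative assumptions as the previous sketch:

  // One 32-bit register per element, two per 64-bit element, and half as many
  // registers once f16 elements are packed in pairs.
  assert(TLI.getNumRegistersForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v4f32) == 4);
  assert(TLI.getNumRegistersForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v2f64) == 4);
  assert(TLI.getNumRegistersForCallingConv(Ctx, CallingConv::AMDGPU_VS,
                                           MVT::v4f16) == 2);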
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC,
+ EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT ScalarVT = VT.getScalarType();
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32) {
+ RegisterVT = ScalarVT.getSimpleVT();
+ IntermediateVT = RegisterVT;
+ NumIntermediates = NumElts;
+ return NumIntermediates;
+ }
+
+ if (Size == 64) {
+ RegisterVT = MVT::i32;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = 2 * NumElts;
+ return NumIntermediates;
+ }
+
+ // FIXME: We should fix the ABI to be the same on targets without 16-bit
+ // support, but unless we can properly handle 3-vectors, it will still be
+ // inconsistent.
+ if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+ RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = NumElts / 2;
+ return NumIntermediates;
+ }
+ }
+
+ return TargetLowering::getVectorTypeBreakdownForCallingConv(
+ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
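And the full breakdown reported for a packed 16-bit case, again a sketch under the same assumed names and a subtarget with 16-bit instructions:

  EVT IntermediateVT;
  MVT RegisterVT;
  unsigned NumIntermediates;
  unsigned NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
      Ctx, CallingConv::AMDGPU_VS, MVT::v4f16, IntermediateVT,
      NumIntermediates, RegisterVT);
  // NumRegs == 2 and RegisterVT == MVT::v2f16: the four halves travel as two
  // packed v2f16 registers rather than four 32-bit registers.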
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1268,6 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
+ assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
!Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
@@ -1301,25 +1384,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
++PSInputNum;
}
- // Second split vertices into their elements.
- if (Arg->VT.isVector()) {
- ISD::InputArg NewArg = *Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg->VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned J = 0; J != NumElements; ++J) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
- }
- } else {
- Splits.push_back(*Arg);
- }
+ Splits.push_back(*Arg);
}
}
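With the calling-convention hooks above reporting per-element register types, vector shader inputs are already scalarized before they reach this function, which is what the new assert enforces. A minimal sketch of the resulting loop contract (an illustrative helper, not in the tree):

  static void copyPresplitArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
                               SmallVectorImpl<ISD::InputArg> &Splits) {
    for (const ISD::InputArg &Arg : Ins) {
      // Vector arguments were already broken into scalar or packed pieces by
      // the calling-convention hooks, so there is nothing left to split here.
      assert(!Arg.VT.isVector() && "vector type argument should have been split");
      Splits.push_back(Arg);
    }
  }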
@@ -4490,6 +4555,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+ const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ unsigned IntrOpcode = Intr->BaseOpcode;
SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
@@ -4575,6 +4643,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 4> VAddrs;
for (unsigned i = 0; i < NumVAddrs; ++i)
VAddrs.push_back(Op.getOperand(AddrIdx + i));
+
+ // Optimize _L to _LZ when the LOD operand is zero
+ if (LZMappingInfo) {
+ if (auto ConstantLod =
+ dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+ VAddrs.pop_back(); // remove 'lod'
+ }
+ }
+ }
+
SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4634,10 +4714,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
int Opcode = -1;
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
- Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
NumVDataDwords, NumVAddrDwords);
assert(Opcode != -1);
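The condition used by the fold, restated as a standalone predicate for clarity (a hypothetical helper, not part of the patch):

  // An image _l sample with a compile-time LOD of zero can use the _lz encoding
  // and drop the LOD operand; the fold above treats a negative constant LOD the
  // same way.
  static bool canUseLZVariant(const ConstantFPSDNode *ConstantLod) {
    return ConstantLod &&
           (ConstantLod->isZero() || ConstantLod->isNegative());
  }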
@@ -4945,7 +5025,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fdot2:
return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
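For reference, the operand layout this change assumes; the quoted intrinsic shape is how llvm.amdgcn.fdot2 is understood to look after gaining its clamp argument, so treat it as illustrative:

  // float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
  // Operand 4 of the call is the clamp bit and becomes the fourth FDOT2 operand:
  SDValue Fdot2 = DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
                              Op.getOperand(1),  // %a
                              Op.getOperand(2),  // %b
                              Op.getOperand(3),  // accumulator %c
                              Op.getOperand(4)); // clamp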
@@ -6754,10 +6835,6 @@ static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
ST->hasFP16Denormals();
- case ISD::FP16_TO_FP:
- case ISD::FP_TO_FP16:
- return ST->hasFP16Denormals();
-
// It can/will be lowered or combined as a bit operation.
// Need to check their input recursively to handle.
case ISD::FNEG:
@@ -6799,8 +6876,16 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
+ SDValue N0 = N->getOperand(0);
+ // fcanonicalize undef -> qnan
+ if (N0.isUndef()) {
+ EVT VT = N->getValueType(0);
+ APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
+ return DAG.getConstantFP(QNaN, SDLoc(N), VT);
+ }
+
+ ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
if (!CFP) {
SDValue N0 = N->getOperand(0);
EVT VT = N0.getValueType().getScalarType();
@@ -6853,7 +6938,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return N->getOperand(0);
+ return N0;
}
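The constant produced by the new undef fold can be checked in isolation; a small self-contained sketch using only APFloat (the bit patterns are the standard quiet-NaN encodings):

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  // fcanonicalize undef folds to the canonical quiet NaN of the operand type.
  APFloat QNaN32 = APFloat::getQNaN(APFloat::IEEEsingle());
  APFloat QNaN16 = APFloat::getQNaN(APFloat::IEEEhalf());
  // QNaN32.bitcastToAPInt() == 0x7FC00000, QNaN16.bitcastToAPInt() == 0x7E00.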
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -7544,8 +7629,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
return SDValue();
if ((Vec1 == Vec3 && Vec2 == Vec4) ||
- (Vec1 == Vec4 && Vec2 == Vec3))
- return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ (Vec1 == Vec4 && Vec2 == Vec3)) {
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
+ DAG.getTargetConstant(0, SL, MVT::i1));
+ }
}
return SDValue();
}