Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 163
1 file changed, 124 insertions(+), 39 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f80652b87373..5ec46a8294c0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
}
}
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements, would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_64RegClass, 31);
+  }
+  case MVT::v4i32:
+  case MVT::v4f32:
+  case MVT::v2i64:
+  case MVT::v2f64: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_128RegClass, 29);
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_256RegClass, 25);
+  }
+  case MVT::v16i32:
+  case MVT::v16f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_512RegClass, 17);
+  }
+  default:
+    return false;
+  }
+}
+
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
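
The register counts passed to allocateCCRegs above (31, 29, 25, 17) follow from packing each tuple inside VGPR0..VGPR31: a tuple spanning N consecutive 32-bit registers has 32 - N + 1 legal start positions. A minimal standalone sketch of that arithmetic (numVGPRTuples is an illustrative helper, not LLVM API):

  #include <cassert>

  // Hypothetical helper: how many tuples of TupleRegs consecutive 32-bit
  // VGPRs fit entirely within VGPR0..VGPR31.
  static unsigned numVGPRTuples(unsigned TupleRegs) {
    const unsigned NumVGPRs = 32; // VGPR0..VGPR31
    return NumVGPRs - TupleRegs + 1;
  }

  int main() {
    assert(numVGPRTuples(2) == 31);  // VReg_64:  last pair is VGPR30_VGPR31
    assert(numVGPRTuples(4) == 29);  // VReg_128: last quad ends at VGPR31
    assert(numVGPRTuples(8) == 25);  // VReg_256
    assert(numVGPRTuples(16) == 17); // VReg_512
  }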
@@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
//===---------------------------------------------------------------------===//
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
-                                                  bool IsVarArg) const {
-  return CC_AMDGPU;
+                                                  bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return CC_AMDGPU;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                    bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return RetCC_SI_Shader;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return RetCC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
}
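
These selectors are meant to be handed to a CCState when a call site is analyzed. A hedged sketch of that usage, assuming the usual LowerCall boilerplate (CLI, ArgLocs, DAG) and the LLVM CodeGen headers; this is not code from the patch itself:

  // Sketch: classify outgoing call arguments with the convention-specific
  // assign function; ArgLocs receives one CCValAssign per argument.
  static void analyzeCallArgs(TargetLowering::CallLoweringInfo &CLI,
                              SelectionDAG &DAG,
                              SmallVectorImpl<CCValAssign> &ArgLocs) {
    CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
                   ArgLocs, *DAG.getContext());
    CCInfo.AnalyzeCallOperands(
        CLI.Outs, AMDGPUCallLowering::CCAssignFnForCall(CLI.CallConv,
                                                        CLI.IsVarArg));
  }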
/// The SelectionDAGBuilder will automatically promote function arguments
@@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
-                                         const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
-  State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                                  bool isVarArg,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  const SmallVectorImpl<SDValue> &OutVals,
-                                  const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+    SDValue Chain, CallingConv::ID CallConv,
+    bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SDLoc &DL, SelectionDAG &DAG) const {
+  // FIXME: Fails for r600 tests
+  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+  //       "wave terminate should not have return values");
   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 }
@@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) {
-  switch (CC) {
-  case CallingConv::C:
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    return CC_AMDGPU;
-  default:
-    report_fatal_error("Unsupported calling convention.");
-  }
+  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                      bool IsVarArg) {
+  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
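
The return-side selector is consumed the same way; a hedged sketch, assuming the usual LowerReturn parameters (CallConv, isVarArg, Outs, DAG) rather than code from this patch:

  // Sketch: classify return values for a non-shader function; RVLocs
  // receives one CCValAssign per return-value component.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));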
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2532,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
     return SDValue();
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
-
-  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
-  // common case, splitting this into a move and a 32-bit shift is faster and
-  // the same code size.
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
     return SDValue();
-  unsigned RHSVal = RHS->getZExtValue();
-  if (RHSVal < 32)
-    return SDValue();
-
   SDValue LHS = N->getOperand(0);
+  unsigned RHSVal = RHS->getZExtValue();
+  if (!RHSVal)
+    return LHS;
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
+  switch (LHS->getOpcode()) {
+  default:
+    break;
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND: {
+    // shl (ext x) => zext (shl x), if shift does not overflow int
+    KnownBits Known;
+    SDValue X = LHS->getOperand(0);
+    DAG.computeKnownBits(X, Known);
+    unsigned LZ = Known.countMinLeadingZeros();
+    if (LZ < RHSVal)
+      break;
+    EVT XVT = X.getValueType();
+    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+    return DAG.getZExtOrTrunc(Shl, SL, VT);
+  }
+  }
+
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  if (RHSVal < 32)
+    return SDValue();
+
   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
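
Both rewrites in performShlCombine can be sanity-checked with plain integer arithmetic; the standalone snippet below only illustrates the two identities and is not part of the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    // shl (ext x), C == zext (shl x, C) when x has at least C known leading
    // zeros, so the narrow 32-bit shift cannot drop set bits.
    uint32_t X = 0x0000ABCDu; // 16 leading zeros as an i32
    unsigned C = 12;          // C <= countLeadingZeros(X)
    assert(((uint64_t)X << C) == (uint64_t)(X << C));

    // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) for C >= 32: the low
    // word is zero and the high word is a 32-bit shift of the truncated value.
    uint64_t Y = 0x123456789ull;
    unsigned C2 = 40;
    uint32_t Hi = (uint32_t)Y << (C2 - 32);
    assert((Y << C2) == ((uint64_t)Hi << 32));
    return 0;
  }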