Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 250
1 file changed, 81 insertions(+), 169 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39016ed37193..1115d8c23620 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -12,10 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
-#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
-#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
-
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
@@ -37,82 +33,9 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
-static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State,
- const TargetRegisterClass *RC,
- unsigned NumRegs) {
- ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
- unsigned RegResult = State.AllocateReg(RegList);
- if (RegResult == AMDGPU::NoRegister)
- return false;
-
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
- return true;
-}
-
-static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- // Up to SGPR0-SGPR105
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::SGPR_64RegClass, 53);
- }
- default:
- return false;
- }
-}
-
-// Allocate up to VGPR31.
-//
-// TODO: Since there are no VGPR alignment requirements would it be better to
-// split into individual scalar registers?
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_64RegClass, 31);
- }
- case MVT::v4i32:
- case MVT::v4f32:
- case MVT::v2i64:
- case MVT::v2f64: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_128RegClass, 29);
- }
- case MVT::v8i32:
- case MVT::v8f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_256RegClass, 25);
-
- }
- case MVT::v16i32:
- case MVT::v16f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_512RegClass, 17);
-
- }
- default:
- return false;
- }
-}
-
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
}
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
@@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
@@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
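[Editor's note] Each setLoadExtAction(ISD::EXTLOAD, DestVT, SrcVT, Expand) call tells the legalizer that no single instruction performs an extending load from SrcVT to DestVT, so such loads are split into a plain load followed by a separate extend. A condensed sketch of the f16 rows in the hunk above (an illustrative loop inside the constructor, not the patch's literal code):

    // Mark fp extending loads from half vectors as Expand: the legalizer
    // will emit a normal load plus a separate FP_EXTEND.
    for (MVT DestVT : {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v8f32,
                       MVT::v16f32, MVT::v32f32}) {
      MVT SrcVT = MVT::getVectorVT(MVT::f16, DestVT.getVectorNumElements());
      setLoadExtAction(ISD::EXTLOAD, DestVT, SrcVT, Expand);
    }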
@@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FABS);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::AssertSext);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}
//===----------------------------------------------------------------------===//
@@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::FREM:
case ISD::INLINEASM:
case ISD::INLINEASM_BR:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+ case ISD::INTRINSIC_W_CHAIN:
// TODO: Should really be looking at the users of the bitcast. These are
// problematic because bitcasts are used to legalize all stores to integer
// types.
case ISD::BITCAST:
return false;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16:
+ return false;
+ default:
+ return true;
+ }
+ }
default:
return true;
}
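[Editor's note] The rewrite above works because of the operand layout of ISD::INTRINSIC_WO_CHAIN nodes: operand 0 is the intrinsic ID as a constant, and the actual call arguments follow it. A minimal sketch of the query (the helper name is illustrative, not from the patch):

    // Recover the intrinsic ID of an INTRINSIC_WO_CHAIN node. Needs
    // llvm/CodeGen/SelectionDAGNodes.h and llvm/IR/Intrinsics.h.
    static unsigned getIntrinsicID(const SDNode *N) {
      assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN);
      // Operand 0 carries the intrinsic ID; arguments start at operand 1.
      return cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    }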
@@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
return false;
bool Fast = false;
- return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
- MMO, &Fast) && Fast;
+ return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ CastTy, MMO, &Fast) &&
+ Fast;
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
break;
case ISD::LOAD:
{
- const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
- if (L->getMemOperand()->getAddrSpace()
- == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+ AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
return false;
}
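[Editor's note] The dyn_cast-to-cast change above is more than a style cleanup: the old code dereferenced the dyn_cast result without a null check, which silently defeats dyn_cast's purpose. The casting idiom (llvm/Support/Casting.h) in brief:

    // cast<T>:     asserts the cast is valid; use when the type is known.
    // dyn_cast<T>: returns nullptr on mismatch, so its result must be tested.
    if (const LoadSDNode *L = dyn_cast<LoadSDNode>(N)) // checked form
      (void)L->getMemOperand();                        // safe: L is non-null
    // Inside `case ISD::LOAD:` the node kind is already guaranteed, so the
    // asserting form used by the patch is the right tool:
    // cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace();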
@@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
+ return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
case ISD::FLOG10:
- return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+ return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
return lowerFEXP(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
@@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-static bool hasDefinedInitializer(const GlobalValue *GV) {
+bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar || !GVar->hasInitializer())
return false;
@@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
}
-// Return M_LOG2E of appropriate type
-static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
- switch (VT.getScalarType().getSimpleVT().SimpleTy) {
- case MVT::f32:
- return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
- case MVT::f16:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
- SL, VT);
- case MVT::f64:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
- default:
- llvm_unreachable("unsupported fp type");
- }
-}
-
// exp2(M_LOG2E_F * f);
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- const SDValue K = getLog2EVal(DAG, SL, VT);
+ const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
}
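[Editor's note] The numbers:: constants from llvm/Support/MathExtras.h replace both the file-local macros deleted at the top of this diff and the per-type getLog2EVal helper. The identities in play: ln(x) = log2(x) / log2(e), log10(x) = log2(x) * ln(2)/ln(10), and exp(x) = exp2(x * log2(e)). A standalone check (plain C++, outside LLVM) that the replacement constants agree with the deleted #defines:

    #include <cassert>
    #include <cmath>

    int main() {
      const float Log2E = 1.44269504088896340735992468100189214f; // was AMDGPU_LOG2E_F
      const float Ln2   = 0.693147180559945309417232121458176568f; // was AMDGPU_LN2_F
      const float Ln10  = 2.30258509299404568401799145468436421f;  // was AMDGPU_LN10_F

      float X = 2.5f;
      assert(std::fabs(std::log(X)   - std::log2(X) * (1.0f / Log2E)) < 1e-6f); // FLOG
      assert(std::fabs(std::log10(X) - std::log2(X) * (Ln2 / Ln10))   < 1e-6f); // FLOG10
      assert(std::fabs(std::exp(X)   - std::exp2(X * Log2E))          < 1e-5f); // FEXP
    }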
@@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- SDValue LHS = Node24->getOperand(0);
- SDValue RHS = Node24->getOperand(1);
+ bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+ SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
+ SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
+ unsigned NewOpcode = Node24->getOpcode();
+ if (IsIntrin) {
+ unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+ NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ }
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24,
SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
if (DemandedLHS || DemandedRHS)
- return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+ return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
DemandedRHS ? DemandedRHS : RHS);
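[Editor's note] APInt::getLowBitsSet(BitWidth, 24) builds the mask of bits the 24-bit multiply actually reads; GetDemandedBits then tries to simplify each operand under that mask, dropping masks, extends, and other operations that only affect bits 24 and up. A small standalone illustration of the mask (needs only llvm/ADT/APInt.h):

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    int main() {
      llvm::APInt Demanded = llvm::APInt::getLowBitsSet(32, 24);
      assert(Demanded.getZExtValue() == 0x00FFFFFFu);
      // mul_i24 / mul_u24 read only these bits of each source, so anything
      // feeding bits 24..31 is dead and can be simplified away.
    }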
@@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
return true;
}
-// Find a load or store from corresponding pattern root.
-// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
- N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
- if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
- return MN;
- assert(isa<BuildVectorSDNode>(N));
- for (SDValue V : N->op_values())
- if (MemSDNode *MN =
- dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
- return MN;
- llvm_unreachable("cannot find MemSDNode in the pattern!");
-}
-
-bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
- SelectionDAG &DAG,
- SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
- const GCNSubtarget &ST =
- DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
- int64_t OffsetVal = 0;
-
- if (ST.hasFlatInstOffsets() &&
- (!ST.hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- DAG.isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = ST.getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
- IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- }
- }
-
- VAddr = Addr;
- Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
-
- return true;
-}
-
// Replace load of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
return SDValue();
}
+
+SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_mul_i24:
+ case Intrinsic::amdgcn_mul_u24:
+ return simplifyI24(N, DCI);
+ default:
+ return SDValue();
+ }
+}
+
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicWOChainCombine(N, DCI);
}
return SDValue();
}
@@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
+// This may be called multiple times, and nothing prevents creating multiple
+// objects at the same offset. See if we already defined this object.
+static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
+ int64_t Offset) {
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ if (MFI.getObjectOffset(I) == Offset) {
+ assert(MFI.getObjectSize(I) == Size);
+ return I;
+ }
+ }
+
+ return MFI.CreateFixedObject(Size, Offset, true);
+}
+
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
EVT VT,
const SDLoc &SL,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+ int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
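[Editor's note] Fixed stack objects live at negative frame indices — getObjectIndexBegin() returns -NumFixedObjects — which is why the lookup loop in getOrCreateFixedStackObject only scans I < 0. A sketch of the index convention (assumes a live MachineFunction MF inside a target; not runnable standalone):

    MachineFrameInfo &MFI = MF.getFrameInfo();
    // Fixed objects (incoming arguments, etc.) count down: -1, -2, ...
    int FixedFI  = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/8,
                                         /*IsImmutable=*/true);
    // Regular objects count up from 0.
    int NormalFI = MFI.CreateStackObject(/*Size=*/4, /*Alignment=*/4,
                                         /*isSpillSlot=*/false);
    assert(FixedFI < 0 && NormalFI >= 0);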
@@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUSubtarget &ST =
AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
- unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ const Align Alignment = ST.getAlignmentForImplicitArgPtr();
uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
ExplicitArgOffset;
switch (Param) {
@@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(DENORM_MODE)
NODE_NAME_CASE(FMA_W_CHAIN)
NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
@@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INIT_EXEC)
- NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
- NODE_NAME_CASE(SENDMSG)
- NODE_NAME_CASE(SENDMSGHALT)
- NODE_NAME_CASE(INTERP_MOV)
- NODE_NAME_CASE(INTERP_P1)
- NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(INTERP_P1LL_F16)
NODE_NAME_CASE(INTERP_P1LV_F16)
NODE_NAME_CASE(INTERP_P2_F16)
@@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_AND)
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+ NODE_NAME_CASE(BUFFER_ATOMIC_INC)
+ NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
@@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
} else if (SelBits == 0x0c) {
- Known.Zero |= 0xff << I;
+ Known.Zero |= 0xFFull << I;
} else if (SelBits > 0x0c) {
- Known.One |= 0xff << I;
+ Known.One |= 0xFFull << I;
}
Sel >>= 8;
}
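[Editor's note] The 0xFFull suffix in the final hunk fixes a width bug: 0xff is a signed int, so 0xff << 24 produces a negative 32-bit value, and widening that to the 64-bit APInt word sign-extends it across the high bits. A standalone demonstration (plain C++; the 0xff << 24 case is well-defined from C++14 on, and was formally undefined before):

    #include <cassert>
    #include <cstdint>

    int main() {
      unsigned I = 24; // byte-select positions step through 0, 8, 16, 24
      uint64_t Bad  = static_cast<uint64_t>(0xff << I); // int shift, sign-extends
      uint64_t Good = 0xFFull << I;                     // 64-bit unsigned shift
      assert(Bad  == 0xFFFFFFFFFF000000ull); // high 32 bits polluted
      assert(Good == 0x00000000FF000000ull); // intended single byte
    }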