Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 250 |
1 file changed, 81 insertions, 169 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39016ed37193..1115d8c23620 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -12,10 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
-#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
-#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
-
 #include "AMDGPUISelLowering.h"
 #include "AMDGPU.h"
 #include "AMDGPUCallLowering.h"
@@ -37,82 +33,9 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
 using namespace llvm;
 
-static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
-                           CCValAssign::LocInfo LocInfo,
-                           ISD::ArgFlagsTy ArgFlags, CCState &State,
-                           const TargetRegisterClass *RC,
-                           unsigned NumRegs) {
-  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
-  unsigned RegResult = State.AllocateReg(RegList);
-  if (RegResult == AMDGPU::NoRegister)
-    return false;
-
-  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
-  return true;
-}
-
-static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
-                              CCValAssign::LocInfo LocInfo,
-                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  switch (LocVT.SimpleTy) {
-  case MVT::i64:
-  case MVT::f64:
-  case MVT::v2i32:
-  case MVT::v2f32:
-  case MVT::v4i16:
-  case MVT::v4f16: {
-    // Up to SGPR0-SGPR105
-    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
-                          &AMDGPU::SGPR_64RegClass, 53);
-  }
-  default:
-    return false;
-  }
-}
-
-// Allocate up to VGPR31.
-//
-// TODO: Since there are no VGPR alignent requirements would it be better to
-// split into individual scalar registers?
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
-                              CCValAssign::LocInfo LocInfo,
-                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  switch (LocVT.SimpleTy) {
-  case MVT::i64:
-  case MVT::f64:
-  case MVT::v2i32:
-  case MVT::v2f32:
-  case MVT::v4i16:
-  case MVT::v4f16: {
-    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
-                          &AMDGPU::VReg_64RegClass, 31);
-  }
-  case MVT::v4i32:
-  case MVT::v4f32:
-  case MVT::v2i64:
-  case MVT::v2f64: {
-    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
-                          &AMDGPU::VReg_128RegClass, 29);
-  }
-  case MVT::v8i32:
-  case MVT::v8f32: {
-    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
-                          &AMDGPU::VReg_256RegClass, 25);
-
-  }
-  case MVT::v16i32:
-  case MVT::v16f32: {
-    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
-                          &AMDGPU::VReg_512RegClass, 17);
-
-  }
-  default:
-    return false;
-  }
-}
-
 #include "AMDGPUGenCallingConv.inc"
 
 // Find a larger type to do a load / store of a vector with.
@@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
   }
 
-  for (MVT VT : MVT::integer_vector_valuetypes()) {
+  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
@@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
@@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
@@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FABS);
   setTargetDAGCombine(ISD::AssertZext);
   setTargetDAGCombine(ISD::AssertSext);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 }
 
 //===----------------------------------------------------------------------===//
@@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) {
   case ISD::FREM:
   case ISD::INLINEASM:
   case ISD::INLINEASM_BR:
-  case AMDGPUISD::INTERP_P1:
-  case AMDGPUISD::INTERP_P2:
   case AMDGPUISD::DIV_SCALE:
+  case ISD::INTRINSIC_W_CHAIN:
 
   // TODO: Should really be looking at the users of the bitcast. These are
   // problematic because bitcasts are used to legalize all stores to integer
   // types.
   case ISD::BITCAST:
     return false;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+    case Intrinsic::amdgcn_interp_p1:
+    case Intrinsic::amdgcn_interp_p2:
+    case Intrinsic::amdgcn_interp_mov:
+    case Intrinsic::amdgcn_interp_p1_f16:
+    case Intrinsic::amdgcn_interp_p2_f16:
+      return false;
+    default:
+      return true;
+    }
+  }
   default:
     return true;
   }
@@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
     return false;
 
   bool Fast = false;
-  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
-                            MMO, &Fast) && Fast;
+  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                        CastTy, MMO, &Fast) &&
+         Fast;
 }
 
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
       break;
     case ISD::LOAD:
     {
-      const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
-      if (L->getMemOperand()->getAddrSpace()
-          == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+      if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+          AMDGPUAS::CONSTANT_ADDRESS_32BIT)
         return true;
       return false;
     }
@@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::FLOG:
-    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
+    return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
   case ISD::FLOG10:
-    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
   case ISD::FEXP:
     return lowerFEXP(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
@@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-static bool hasDefinedInitializer(const GlobalValue *GV) {
+bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
   if (!GVar || !GVar->hasInitializer())
     return false;
@@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
 }
 
-// Return M_LOG2E of appropriate type
-static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
-  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
-  case MVT::f32:
-    return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
-  case MVT::f16:
-    return DAG.getConstantFP(
-      APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
-      SL, VT);
-  case MVT::f64:
-    return DAG.getConstantFP(
-      APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
-  default:
-    llvm_unreachable("unsupported fp type");
-  }
-}
-
 // exp2(M_LOG2E_F * f);
 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
 
-  const SDValue K = getLog2EVal(DAG, SL, VT);
+  const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
 }
@@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
 static SDValue simplifyI24(SDNode *Node24,
                            TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
-  SDValue LHS = Node24->getOperand(0);
-  SDValue RHS = Node24->getOperand(1);
+  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
+  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
+  unsigned NewOpcode = Node24->getOpcode();
+  if (IsIntrin) {
+    unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+    NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
+      AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+  }
 
   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
 
@@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24,
   SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
   SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
   if (DemandedLHS || DemandedRHS)
-    return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                        DemandedLHS ? DemandedLHS : LHS,
                        DemandedRHS ? DemandedRHS : RHS);
@@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
   return true;
 }
 
-// Find a load or store from corresponding pattern root.
-// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
-  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
-  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
-    return MN;
-  assert(isa<BuildVectorSDNode>(N));
-  for (SDValue V : N->op_values())
-    if (MemSDNode *MN =
-          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
-      return MN;
-  llvm_unreachable("cannot find MemSDNode in the pattern!");
-}
-
-bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
-                                            SelectionDAG &DAG,
-                                            SDNode *N,
-                                            SDValue Addr,
-                                            SDValue &VAddr,
-                                            SDValue &Offset,
-                                            SDValue &SLC) const {
-  const GCNSubtarget &ST =
-      DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
-  int64_t OffsetVal = 0;
-
-  if (ST.hasFlatInstOffsets() &&
-      (!ST.hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
-      DAG.isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
-    const SIInstrInfo *TII = ST.getInstrInfo();
-    if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
-                               IsSigned)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    }
-  }
-
-  VAddr = Addr;
-  Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
-  SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
-
-  return true;
-}
-
 // Replace load of an illegal type with a store of a bitcast to a friendlier
 // type.
 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
 
   return SDValue();
 }
+
+SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IID) {
+  case Intrinsic::amdgcn_mul_i24:
+  case Intrinsic::amdgcn_mul_u24:
+    return simplifyI24(N, DCI);
+  default:
+    return SDValue();
+  }
+}
+
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AssertZext:
   case ISD::AssertSext:
     return performAssertSZExtCombine(N, DCI);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return performIntrinsicWOChainCombine(N, DCI);
   }
   return SDValue();
 }
@@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
 }
 
+// This may be called multiple times, and nothing prevents creating multiple
+// objects at the same offset. See if we already defined this object.
+static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
+                                       int64_t Offset) {
+  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+    if (MFI.getObjectOffset(I) == Offset) {
+      assert(MFI.getObjectSize(I) == Size);
+      return I;
+    }
+  }
+
+  return MFI.CreateFixedObject(Size, Offset, true);
+}
+
 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                   EVT VT,
                                                   const SDLoc &SL,
                                                   int64_t Offset) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
+  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
 
-  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
 
   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
@@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
   const AMDGPUSubtarget &ST =
       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
-  unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+  const Align Alignment = ST.getAlignmentForImplicitArgPtr();
   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
                        ExplicitArgOffset;
   switch (Param) {
@@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
   NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(DENORM_MODE)
   NODE_NAME_CASE(FMA_W_CHAIN)
   NODE_NAME_CASE(FMUL_W_CHAIN)
   NODE_NAME_CASE(CLAMP)
@@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(KILL)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
-  NODE_NAME_CASE(INIT_EXEC)
-  NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
-  NODE_NAME_CASE(SENDMSG)
-  NODE_NAME_CASE(SENDMSGHALT)
-  NODE_NAME_CASE(INTERP_MOV)
-  NODE_NAME_CASE(INTERP_P1)
-  NODE_NAME_CASE(INTERP_P2)
  NODE_NAME_CASE(INTERP_P1LL_F16)
  NODE_NAME_CASE(INTERP_P1LV_F16)
  NODE_NAME_CASE(INTERP_P2_F16)
@@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
+  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
   NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
@@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
       } else if (SelBits == 0x0c) {
-        Known.Zero |= 0xff << I;
+        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
-        Known.One |= 0xff << I;
+        Known.One |= 0xFFull << I;
       }
       Sel >>= 8;
     }
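Note (not part of the patch): the constants this change switches to, numbers::log2ef, numbers::ln2f and numbers::ln10f from llvm/Support/MathExtras.h, encode the identities the FLOG, FLOG10 and FEXP lowerings rely on: log(x) = log2(x) * (1 / log2(e)), log10(x) = log2(x) * (ln 2 / ln 10), and exp(x) = exp2(x * log2(e)). A minimal standalone sketch that checks those identities with plain <cmath>, reusing the literal values the removed AMDGPU_* macros carried (the test point is arbitrary and for illustration only):

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative copies of the values the patch now takes from llvm::numbers.
  const float Log2E = 1.44269504088896340735992468100189214f; // log2(e)
  const float Ln2 = 0.693147180559945309417232121458176568f;  // ln(2)
  const float Ln10 = 2.30258509299404568401799145468436421f;  // ln(10)

  const float X = 3.7f; // arbitrary test point
  // FLOG lowering:   log(x)   ~= log2(x) * (1 / log2(e))
  // FLOG10 lowering: log10(x) ~= log2(x) * (ln(2) / ln(10))
  // FEXP lowering:   exp(x)   ~= exp2(x * log2(e))
  std::printf("log:   %g vs %g\n", std::log(X), std::log2(X) * (1.0f / Log2E));
  std::printf("log10: %g vs %g\n", std::log10(X), std::log2(X) * (Ln2 / Ln10));
  std::printf("exp:   %g vs %g\n", std::exp(X), std::exp2(X * Log2E));
  return 0;
}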