Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--   lib/Target/AMDGPU/AMDGPUISelLowering.cpp          |  17
-rw-r--r--   lib/Target/AMDGPU/AMDGPUInstrInfo.td              |   5
-rw-r--r--   lib/Target/AMDGPU/AMDGPUInstructions.td           |   1
-rw-r--r--   lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp  |  26
-rw-r--r--   lib/Target/AMDGPU/MIMGInstructions.td             |  26
-rw-r--r--   lib/Target/AMDGPU/SIISelLowering.cpp              | 147
-rw-r--r--   lib/Target/AMDGPU/SIISelLowering.h                |  13
-rw-r--r--   lib/Target/AMDGPU/SIInsertSkips.cpp               |  22
-rw-r--r--   lib/Target/AMDGPU/SIInstrInfo.cpp                 |  30
-rw-r--r--   lib/Target/AMDGPU/SIInstrInfo.h                   |   3
-rw-r--r--   lib/Target/AMDGPU/SIInstructions.td               |  10
-rw-r--r--   lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp        |   1
-rw-r--r--   lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h          |   9
-rw-r--r--   lib/Target/AMDGPU/VOP3PInstructions.td            |  31
14 files changed, 246 insertions, 95 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b201126c593b..21e44e9589d3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -554,6 +554,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FTRUNC:
   case ISD::FRINT:
   case ISD::FNEARBYINT:
+  case ISD::FCANONICALIZE:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
   case AMDGPUISD::RCP_IFLAG:
@@ -907,6 +908,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   LLVMContext &Ctx = Fn.getParent()->getContext();
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+  CallingConv::ID CC = Fn.getCallingConv();
 
   unsigned MaxAlign = 1;
   uint64_t ExplicitArgOffset = 0;
@@ -940,16 +942,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
 
       EVT ArgVT = ValueVTs[Value];
       EVT MemVT = ArgVT;
-      MVT RegisterVT =
-        getRegisterTypeForCallingConv(Ctx, ArgVT);
-      unsigned NumRegs =
-        getNumRegistersForCallingConv(Ctx, ArgVT);
-
-      if (!Subtarget->isAmdHsaOS() &&
-          (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
-        // The ABI says the caller will extend these values to 32-bits.
-        MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
-      } else if (NumRegs == 1) {
+      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
+      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
+
+      if (NumRegs == 1) {
         // This argument is not split, so the IR type is the memory type.
         if (ArgVT.isExtended()) {
           // We have an extended type, like i24, so we should just use the
@@ -3600,6 +3596,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   case ISD::FRINT:
   case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FSIN:
+  case ISD::FCANONICALIZE:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
   case AMDGPUISD::RCP_IFLAG:
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 96b7568eec1f..7442a59e594f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
 def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
 
 def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
-                  SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
-                                       SDTCisFP<0>, SDTCisVec<1>]>,
+                  SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<0>, SDTCisVec<1>,
+                                       SDTCisInt<4>]>,
                   []>;
 
 def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 9426df399597..c9c932ef2f5f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -567,6 +567,7 @@ int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
 int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
 int FP16_ONE = 0x3C00;
+int FP16_NEG_ONE = 0xBC00;
 int V2FP16_ONE = 0x3C003C00;
 int FP32_ONE = 0x3f800000;
 int FP32_NEG_ONE = 0xbf800000;
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 8cc7e38f7b29..c147830e12ed 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -100,16 +100,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     unsigned Size = DL.getTypeSizeInBits(ArgTy);
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
-
-    // Clover seems to always pad i8/i16 to i32, but doesn't properly align
-    // them?
-    // Make sure the struct elements have correct size and alignment for ext
-    // args. These seem to be padded up to 4-bytes but not correctly aligned.
-    bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
-                    !ST.isAmdHsaOS();
-    if (IsExtArg)
-      AllocSize = 4;
-
     uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
 
@@ -164,8 +154,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
                                      ArgPtr->getName() + ".cast");
     }
 
-    assert((!IsExtArg || !IsV3) && "incompatible situation");
-
     if (IsV3 && Size >= 32) {
       V4Ty = VectorType::get(VT->getVectorElementType(), 4);
       // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
@@ -212,20 +200,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     // TODO: Convert noalias arg to !noalias
 
     if (Size < 32 && !ArgTy->isAggregateType()) {
-      if (IsExtArg && OffsetDiff == 0) {
-        Type *I32Ty = Builder.getInt32Ty();
-        bool IsSext = Arg.hasSExtAttr();
-        Metadata *LowAndHigh[] = {
-          ConstantAsMetadata::get(
-            ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
-          ConstantAsMetadata::get(
-            ConstantInt::get(I32Ty,
-                             IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
-        };
-
-        Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
-      }
-
       Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 1e0bc62c45a6..44c2d366e461 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -66,6 +66,22 @@ def MIMGDimInfoTable : GenericTable {
   let PrimaryKeyName = "getMIMGDimInfo";
 }
 
+class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> {
+  MIMGBaseOpcode L = l;
+  MIMGBaseOpcode LZ = lz;
+}
+
+def MIMGLZMappingTable : GenericTable {
+  let FilterClass = "MIMGLZMapping";
+  let CppTypeName = "MIMGLZMappingInfo";
+  let Fields = ["L", "LZ"];
+  GenericEnum TypeOf_L = MIMGBaseOpcode;
+  GenericEnum TypeOf_LZ = MIMGBaseOpcode;
+
+  let PrimaryKey = ["L"];
+  let PrimaryKeyName = "getMIMGLZMappingInfo";
+}
+
 class mimg <bits<7> si, bits<7> vi = si> {
   field bits<7> SI = si;
   field bits<7> VI = vi;
@@ -547,3 +563,13 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
                            AMDGPUImageDimAtomicIntrinsics) in {
   def : ImageDimIntrinsicInfo<intr>;
 }
+
+// L to LZ Optimization Mapping
+def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>;
+def : MIMGLZMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_LZ_O>;
+def : MIMGLZMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_LZ_O>;
+def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>;
+def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>;
+def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>;
+def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc2656a20..25007861fd15 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,87 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
   return false;
 }
 
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                    CallingConv::ID CC,
+                                                    EVT VT) const {
+  // TODO: Consider splitting all arguments into 32-bit pieces.
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+    EVT ScalarVT = VT.getScalarType();
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32)
+      return ScalarVT.getSimpleVT();
+
+    if (Size == 64)
+      return MVT::i32;
+
+    if (Size == 16 &&
+        Subtarget->has16BitInsts() &&
+        isPowerOf2_32(VT.getVectorNumElements()))
+      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+  }
+
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT ScalarVT = VT.getScalarType();
+    unsigned Size = ScalarVT.getSizeInBits();
+
+    if (Size == 32)
+      return NumElts;
+
+    if (Size == 64)
+      return 2 * NumElts;
+
+    // FIXME: Fails to break down as we want with v3.
+    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
+      return VT.getVectorNumElements() / 2;
+  }
+
+  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+  LLVMContext &Context, CallingConv::ID CC,
+  EVT VT, EVT &IntermediateVT,
+  unsigned &NumIntermediates, MVT &RegisterVT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT ScalarVT = VT.getScalarType();
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32) {
+      RegisterVT = ScalarVT.getSimpleVT();
+      IntermediateVT = RegisterVT;
+      NumIntermediates = NumElts;
+      return NumIntermediates;
+    }
+
+    if (Size == 64) {
+      RegisterVT = MVT::i32;
+      IntermediateVT = RegisterVT;
+      NumIntermediates = 2 * NumElts;
+      return NumIntermediates;
+    }
+
+    // FIXME: We should fix the ABI to be the same on targets without 16-bit
+    // support, but unless we can properly handle 3-vectors, it will be still be
+    // inconsistent.
+    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      IntermediateVT = RegisterVT;
+      NumIntermediates = NumElts / 2;
+      return NumIntermediates;
+    }
+  }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -1268,6 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
 
+    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
         !Arg->Flags.isByVal() && PSInputNum <= 15) {
@@ -1301,25 +1384,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
       ++PSInputNum;
     }
 
-    // Second split vertices into their elements.
-    if (Arg->VT.isVector()) {
-      ISD::InputArg NewArg = *Arg;
-      NewArg.Flags.setSplit();
-      NewArg.VT = Arg->VT.getVectorElementType();
-
-      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
-      // three or five element vertex only needs three or five registers,
-      // NOT four or eight.
-      Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      for (unsigned J = 0; J != NumElements; ++J) {
-        Splits.push_back(NewArg);
-        NewArg.PartOffset += NewArg.VT.getStoreSize();
-      }
-    } else {
-      Splits.push_back(*Arg);
-    }
+    Splits.push_back(*Arg);
   }
 }
 
@@ -4490,6 +4555,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+  unsigned IntrOpcode = Intr->BaseOpcode;
 
   SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
   bool IsD16 = false;
@@ -4575,6 +4643,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   SmallVector<SDValue, 4> VAddrs;
   for (unsigned i = 0; i < NumVAddrs; ++i)
     VAddrs.push_back(Op.getOperand(AddrIdx + i));
+
+  // Optimize _L to _LZ when _L is zero
+  if (LZMappingInfo) {
+    if (auto ConstantLod =
+         dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+        IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
+        VAddrs.pop_back();               // remove 'lod'
+      }
+    }
+  }
+
   SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
 
   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4634,10 +4714,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
 
   int Opcode = -1;
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                    NumVDataDwords, NumVAddrDwords);
   if (Opcode == -1)
-    Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                    NumVDataDwords, NumVAddrDwords);
   assert(Opcode != -1);
 
@@ -4945,7 +5025,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_fdot2:
     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       Op.getOperand(4));
   case Intrinsic::amdgcn_fmul_legacy:
     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                        Op.getOperand(1), Op.getOperand(2));
@@ -6754,10 +6835,6 @@ static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
            ST->hasFP16Denormals();
 
-  case ISD::FP16_TO_FP:
-  case ISD::FP_TO_FP16:
-    return ST->hasFP16Denormals();
-
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
  case ISD::FNEG:
@@ -6799,8 +6876,16 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
 
-  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
+  SDValue N0 = N->getOperand(0);
+  // fcanonicalize undef -> qnan
+  if (N0.isUndef()) {
+    EVT VT = N->getValueType(0);
+    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
+    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
+  }
+
+  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
   if (!CFP) {
     SDValue N0 = N->getOperand(0);
     EVT VT = N0.getValueType().getScalarType();
@@ -6853,7 +6938,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
   }
 
-  return N->getOperand(0);
+  return N0;
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -7544,8 +7629,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
       return SDValue();
 
     if ((Vec1 == Vec3 && Vec2 == Vec4) ||
-        (Vec1 == Vec4 && Vec2 == Vec3))
-      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+        (Vec1 == Vec4 && Vec2 == Vec3)) {
+      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
+                         DAG.getTargetConstant(0, SL, MVT::i1));
+    }
   }
 
   return SDValue();
 }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index ad049f2a71c3..5b3d49b3d8e3 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -25,6 +25,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 private:
   const GCNSubtarget *Subtarget;
 
+public:
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                    CallingConv::ID CC,
+                                    EVT VT) const override;
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override;
+
+  unsigned getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+private:
   SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
                                    SDValue Chain, uint64_t Offset) const;
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index 61c8f359e168..dc9397cf7b85 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -133,28 +133,10 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
         I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
       return true;
 
-    // V_READFIRSTLANE/V_READLANE destination register may be used as operand
-    // by some SALU instruction. If exec mask is zero vector instruction
-    // defining the register that is used by the scalar one is not executed
-    // and scalar instruction will operate on undefined data. For
-    // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
-    if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
-        (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+    if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
       return true;
-    }
-
-    if (I->isInlineAsm()) {
-      const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-      const char *AsmStr = I->getOperand(0).getSymbolName();
-
-      // inlineasm length estimate is number of bytes assuming the longest
-      // instruction.
-      uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-      NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-    } else {
-      ++NumInstr;
-    }
+    ++NumInstr;
 
     if (NumInstr >= SkipThreshold)
       return true;
   }
 
   return false;
 }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6c85c92454c3..f3745382a6f4 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2332,6 +2332,36 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
          changesVGPRIndexingMode(MI);
 }
 
+bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+
+  if (MI.mayStore() && isSMRD(MI))
+    return true; // scalar store or atomic
+
+  // These instructions cause shader I/O that may cause hardware lockups
+  // when executed with an empty EXEC mask.
+  //
+  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
+  //       EXEC = 0, but checking for that case here seems not worth it
+  //       given the typical code patterns.
+  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
+      Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+    return true;
+
+  if (MI.isInlineAsm())
+    return true; // conservative assumption
+
+  // These are like SALU instructions in terms of effects, so it's questionable
+  // whether we should return true for those.
+  //
+  // However, executing them with EXEC = 0 causes them to operate on undefined
+  // data, which we avoid by returning true here.
+  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
+    return true;
+
+  return false;
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   switch (Imm.getBitWidth()) {
   case 32:
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0a735257d34e..d681b926504e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -597,6 +597,9 @@ public:
     return !RI.isSGPRReg(MRI, Dest);
   }
 
+  /// Whether we must prevent this instruction from executing with EXEC = 0.
+  bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+
   bool isInlineConstant(const APInt &Imm) const;
 
   bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index c3f8bfb53ef4..5c10646161b3 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1387,6 +1387,11 @@ def : GCNPat<
 >;
 
 def : GCNPat<
+  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+>;
+
+def : GCNPat<
   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
   (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
 >;
@@ -1411,6 +1416,11 @@ def : GCNPat<
   (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
   (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
 >;
+
+def : GCNPat<
+  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+>;
 }
 
 let OtherPredicates = [FP32Denormals] in {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3fd3c75874a3..4eba19382315 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -110,6 +110,7 @@ struct MIMGInfo {
 #define GET_MIMGBaseOpcodesTable_IMPL
 #define GET_MIMGDimInfoTable_IMPL
 #define GET_MIMGInfoTable_IMPL
+#define GET_MIMGLZMappingTable_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 70681c271697..5b7af8268cda 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -42,6 +42,7 @@ namespace AMDGPU {
 #define GET_MIMGBaseOpcode_DECL
 #define GET_MIMGDim_DECL
 #define GET_MIMGEncoding_DECL
+#define GET_MIMGLZMapping_DECL
 #include "AMDGPUGenSearchableTables.inc"
 
 namespace IsaInfo {
@@ -211,6 +212,14 @@ struct MIMGDimInfo {
 LLVM_READONLY
 const MIMGDimInfo *getMIMGDimInfo(unsigned Dim);
 
+struct MIMGLZMappingInfo {
+  MIMGBaseOpcode L;
+  MIMGBaseOpcode LZ;
+};
+
+LLVM_READONLY
+const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
+
 LLVM_READONLY
 int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
                   unsigned VDataDwords, unsigned VAddrDwords);
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 5c78ada3211e..b51828b54679 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
 
 let SubtargetPredicate = HasDLInsts in {
 
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
-def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
-def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
-def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
-def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
+def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+
+multiclass DotPats<SDPatternOperator dot_op,
+                   VOP3PInst dot_inst> {
+  def : GCNPat <
+    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
+            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
+            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
+    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
+}
+
+defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
+defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
+defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
+defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
+defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
+defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
+defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
 
 } // End SubtargetPredicate = HasDLInsts
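
Reviewer note: the new SITargetLowering::getRegisterTypeForCallingConv / getNumRegistersForCallingConv / getVectorTypeBreakdownForCallingConv overrides above encode a small decision table for non-kernel vector arguments. The following standalone C++ program is only a sketch of that rule for sanity-checking it on paper; it does not use the LLVM API, and the type-name strings and helper names in it are made up for the illustration.

// Standalone sketch of the register breakdown rule added in SIISelLowering.cpp.
// This models the logic only; it is not the LLVM implementation.
#include <cstdio>
#include <string>

struct Breakdown {
  std::string RegisterVT; // register type used for each piece
  unsigned NumRegs;       // how many such registers the argument occupies
};

// isKernel: CC == AMDGPU_KERNEL (kernels keep the default breakdown).
// scalarBits/numElts: element size and count of the vector argument.
// isFloat: whether the element type is floating point.
// has16BitInsts: Subtarget->has16BitInsts().
static Breakdown breakdownVectorArg(bool isKernel, unsigned scalarBits,
                                    unsigned numElts, bool isFloat,
                                    bool has16BitInsts) {
  auto isPow2 = [](unsigned x) { return x && !(x & (x - 1)); };

  if (!isKernel) {
    if (scalarBits == 32)
      return {isFloat ? "f32" : "i32", numElts};         // one register per element
    if (scalarBits == 64)
      return {"i32", 2 * numElts};                       // split into 32-bit pieces
    if (scalarBits == 16 && has16BitInsts && isPow2(numElts))
      return {isFloat ? "v2f16" : "v2i16", numElts / 2}; // pack pairs of halves
  }
  // Everything else falls back to the generic TargetLowering behaviour,
  // which this sketch does not model.
  return {"(default breakdown)", 0};
}

int main() {
  Breakdown a = breakdownVectorArg(false, 32, 4, true, true); // v4f32 -> 4 x f32
  Breakdown b = breakdownVectorArg(false, 64, 2, true, true); // v2f64 -> 4 x i32
  Breakdown c = breakdownVectorArg(false, 16, 4, true, true); // v4f16 -> 2 x v2f16
  Breakdown d = breakdownVectorArg(false, 16, 3, true, true); // v3f16: not pow2, default path
  std::printf("v4f32: %u x %s\n", a.NumRegs, a.RegisterVT.c_str());
  std::printf("v2f64: %u x %s\n", b.NumRegs, b.RegisterVT.c_str());
  std::printf("v4f16: %u x %s\n", c.NumRegs, c.RegisterVT.c_str());
  std::printf("v3f16: %s\n", d.RegisterVT.c_str());
  return 0;
}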
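Reviewer note: the MIMGLZMapping table and the lowerImage change rewrite _L image opcodes to their _LZ forms when the LOD operand folded to a constant that is zero or negative, and drop that operand. The toy model below restates just that decision; the opcode strings, the Optional-based "constant LOD" stand-in, and the structure are illustrative only and do not mirror LLVM data structures.

// Toy model of the _L -> _LZ rewrite added to SITargetLowering::lowerImage.
#include <cstdio>
#include <optional>
#include <string>
#include <vector>

struct ImageOp {
  std::string baseOpcode;          // e.g. "image_sample_l"
  std::vector<double> vaddr;       // address operands; LOD is the last one
  std::optional<double> constLod;  // set when the LOD folded to a constant
};

// Mirrors the idea of the MIMGLZMapping table: which _l opcode has an _lz twin.
static std::optional<std::string> lzVariant(const std::string &op) {
  if (op == "image_sample_l")  return "image_sample_lz";
  if (op == "image_gather4_l") return "image_gather4_lz";
  return std::nullopt;           // no mapping entry -> leave the op alone
}

static void optimizeLToLz(ImageOp &op) {
  auto lz = lzVariant(op.baseOpcode);
  if (!lz || !op.constLod)
    return;
  // A zero or negative LOD behaves like LOD 0, so the explicit operand can go.
  if (*op.constLod <= 0.0) {
    op.baseOpcode = *lz;   // switch to the _lz encoding
    op.vaddr.pop_back();   // drop the now-implicit 'lod' address operand
  }
}

int main() {
  ImageOp op{"image_sample_l", {0.5, 0.25, 0.0}, 0.0};
  optimizeLToLz(op);
  std::printf("%s with %zu address operands\n", op.baseOpcode.c_str(),
              op.vaddr.size());
  return 0;
}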
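Reviewer note: the FDOT2 node and the amdgcn.fdot2 lowering now carry a fourth, i1 operand that the new DotPats pattern feeds into the instruction's clamp field, and the fma -> fdot2 combine passes 0 for it. The sketch below is a host-side reference of what the operation computes; treating clamp as saturate-to-[0,1] and modelling the v2f16 sources as plain floats are assumptions made for the example, not something this diff states.

// Reference model of the extended FDOT2: a[0]*b[0] + a[1]*b[1] + c,
// with the new operand selecting the output clamp.
#include <algorithm>
#include <array>
#include <cstdio>

static float fdot2(std::array<float, 2> a, std::array<float, 2> b, float c,
                   bool clamp) {
  float r = a[0] * b[0] + a[1] * b[1] + c;
  return clamp ? std::min(1.0f, std::max(0.0f, r)) : r;
}

int main() {
  // performFMACombine builds FDOT2 with clamp = 0.
  std::printf("%f\n", fdot2({0.5f, 0.25f}, {2.0f, 4.0f}, 1.0f, false)); // 3.0
  // The intrinsic's last argument now selects the clamp bit.
  std::printf("%f\n", fdot2({0.5f, 0.25f}, {2.0f, 4.0f}, 1.0f, true));  // 1.0
  return 0;
}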
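Reviewer note: two fcanonicalize changes appear above: performFCanonicalizeCombine folds fcanonicalize(undef) to the default quiet NaN, and the new SIInstructions.td patterns select fcanonicalize(fneg x) as a single multiply by -1.0 (FP32_NEG_ONE / FP16_NEG_ONE) so the negation folds into the canonicalizing multiply. The snippet below only illustrates that intent on the host; it does not reproduce GPU denormal or NaN quieting behaviour.

// Host-side illustration of the fcanonicalize folds touched by this patch.
#include <cmath>
#include <cstdio>
#include <limits>
#include <optional>

// std::nullopt stands in for an undef SDValue in this sketch.
static float canonicalize(std::optional<float> x) {
  if (!x)
    return std::numeric_limits<float>::quiet_NaN(); // fcanonicalize undef -> qnan
  if (std::isnan(*x))
    return std::numeric_limits<float>::quiet_NaN(); // quiet any NaN constant
  return *x; // an already-canonical constant is returned unchanged (return N0)
}

// Shape of the code emitted by the new patterns for fcanonicalize(fneg x):
// one multiply by -1.0 instead of a separate fneg plus a multiply by 1.0.
static float canonicalizeNeg(float x) { return x * -1.0f; }

int main() {
  std::printf("%f\n", canonicalize(2.5f));         // 2.5
  std::printf("%f\n", canonicalize(std::nullopt)); // nan
  std::printf("%f\n", canonicalizeNeg(2.5f));      // -2.5
  return 0;
}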
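Reviewer note: the SIInsertSkips.cpp and SIInstrInfo.cpp hunks move the "must not run with EXEC = 0" classification into SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty and have shouldSkip query it. The standalone model below shows the classes the helper flags and how the skip decision consumes it; the string opcodes and simplified block/threshold handling are invented for the example and are not the MachineInstr API.

// Toy model of hasUnwantedEffectsWhenEXECEmpty() and its use in shouldSkip().
#include <cstdio>
#include <string>
#include <vector>

struct Inst {
  std::string opcode;
  bool isScalarStore = false;
  bool isInlineAsm = false;
};

// The classes the patch flags: scalar stores/atomics, message and export
// instructions (possible hardware lockups with EXEC = 0), inline asm
// (conservatively), and readlane/readfirstlane (would read undefined data).
static bool hasUnwantedEffectsWhenEXECEmpty(const Inst &I) {
  if (I.isScalarStore || I.isInlineAsm)
    return true;
  return I.opcode == "s_sendmsg" || I.opcode == "s_sendmsghalt" ||
         I.opcode == "exp" || I.opcode == "exp_done" ||
         I.opcode == "v_readfirstlane_b32" || I.opcode == "v_readlane_b32";
}

// Simplified shouldSkip: a region must be branched over (rather than run
// predicated with EXEC = 0) if it is long or contains a flagged instruction.
static bool shouldSkip(const std::vector<Inst> &Block, unsigned SkipThreshold) {
  unsigned NumInstr = 0;
  for (const Inst &I : Block) {
    if (hasUnwantedEffectsWhenEXECEmpty(I))
      return true;
    if (++NumInstr >= SkipThreshold)
      return true;
  }
  return false;
}

int main() {
  std::vector<Inst> block = {{"v_add_f32"}, {"exp"}, {"v_mul_f32"}};
  std::printf("skip needed: %s\n", shouldSkip(block, 12) ? "yes" : "no");
  return 0;
}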