Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 520
1 file changed, 289 insertions, 231 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7b5248906b56..746f652bfa36 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/FPEnv.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" @@ -48,7 +49,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -71,14 +71,14 @@ static cl::opt<bool> sched4reg( "nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); -static cl::opt<unsigned> -FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, - cl::desc("NVPTX Specific: FMA contraction (0: don't do it" - " 1: do it 2: do it aggressively"), - cl::init(2)); +static cl::opt<unsigned> FMAContractLevelOpt( + "nvptx-fma-level", cl::Hidden, + cl::desc("NVPTX Specific: FMA contraction (0: don't do it" + " 1: do it 2: do it aggressively"), + cl::init(2)); static cl::opt<int> UsePrecDivF32( - "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, + "nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2)); @@ -487,6 +487,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CTLZ, Ty, Legal); } + setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + if (STI.getPTXVersion() >= 43) { + setOperationAction(ISD::ADDC, MVT::i64, Legal); + setOperationAction(ISD::ADDE, MVT::i64, Legal); + setOperationAction(ISD::SUBC, MVT::i64, Legal); + setOperationAction(ISD::SUBE, MVT::i64, Legal); + } + setOperationAction(ISD::CTTZ, MVT::i16, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); @@ -499,13 +510,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); // We have some custom DAG combine patterns for these nodes - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SREM); - setTargetDAGCombine(ISD::UREM); + setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL, + ISD::SREM, ISD::UREM}); // setcc for f16x2 needs special handling to prevent legalizer's // attempt to scalarize it due to v2i1 not being legal. 
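The hunk above marks the carry-propagating nodes ISD::ADDC/ADDE and ISD::SUBC/SUBE Legal for i32, and gates the i64 forms on PTX ISA >= 4.3, so wide integer arithmetic can lower to PTX carry instructions instead of being expanded with compares. As a rough standalone C++ sketch (not LLVM code; the helper name add64_via_32 is made up for illustration) of what that operation pair expresses, a 64-bit add split over 32-bit halves looks like this:

```cpp
// Illustration only: the ADDC/ADDE pair expresses exactly this split of a
// wide add into a low add that produces a carry and a high add that
// consumes it.
#include <cstdint>
#include <cstdio>

static uint64_t add64_via_32(uint64_t a, uint64_t b) {
  uint32_t alo = (uint32_t)a, ahi = (uint32_t)(a >> 32);
  uint32_t blo = (uint32_t)b, bhi = (uint32_t)(b >> 32);

  uint32_t lo = alo + blo;         // ADDC: add low halves, produce carry
  uint32_t carry = lo < alo;       // carry-out of the low add
  uint32_t hi = ahi + bhi + carry; // ADDE: add high halves plus carry-in

  return ((uint64_t)hi << 32) | lo;
}

int main() {
  uint64_t a = 0x00000001FFFFFFFFULL, b = 1;
  printf("%llx\n", (unsigned long long)add64_via_32(a, b)); // prints 200000000
  return 0;
}
```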
@@ -583,6 +589,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Now deduce the information based on the above mentioned // actions computeRegisterProperties(STI.getRegisterInfo()); + + setMinCmpXchgSizeInBits(32); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1302,8 +1310,8 @@ std::string NVPTXTargetLowering::getPrototype( bool first = true; - unsigned OIdx = 0; - for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + const Function *F = CB.getFunction(); + for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) { Type *Ty = Args[i].Ty; if (!first) { O << ", "; @@ -1312,15 +1320,14 @@ std::string NVPTXTargetLowering::getPrototype( if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - unsigned align = 0; + unsigned ParamAlign = 0; const CallInst *CallI = cast<CallInst>(&CB); // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, align)) - align = DL.getABITypeAlignment(Ty); - unsigned sz = DL.getTypeAllocSize(Ty); - O << ".param .align " << align << " .b8 "; + if (!getAlign(*CallI, i + 1, ParamAlign)) + ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); + O << ".param .align " << ParamAlign << " .b8 "; O << "_"; - O << "[" << sz << "]"; + O << "[" << DL.getTypeAllocSize(Ty) << "]"; // update the index for Outs SmallVector<EVT, 16> vtparts; ComputeValueVTs(*this, DL, Ty, vtparts); @@ -1351,15 +1358,18 @@ std::string NVPTXTargetLowering::getPrototype( O << "_"; continue; } - auto *PTy = dyn_cast<PointerType>(Ty); - assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getPointerElementType(); - Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); - unsigned sz = DL.getTypeAllocSize(ETy); - O << ".param .align " << align.value() << " .b8 "; + Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment. This code matches logic in LowerCall when + // alignment increase is performed to increase vectorization options. 
+ Type *ETy = Args[i].IndirectType; + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); + + O << ".param .align " << ParamByValAlign.value() << " .b8 "; O << "_"; - O << "[" << sz << "]"; + O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; } O << ");"; return O.str(); @@ -1406,12 +1416,15 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function - if (DirectCallee) + if (DirectCallee) { if (getAlign(*DirectCallee, Idx, Alignment)) return Align(Alignment); + // If alignment information is not available, fall back to the + // default function param optimized type alignment + return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); + } - // Call is indirect or alignment information is not available, fall back to - // the ABI type alignment + // Call is indirect, fall back to the ABI type alignment return DL.getABITypeAlign(Ty); } @@ -1436,11 +1449,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Chain; unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); - SDValue tempChain = Chain; + SDValue TempChain = Chain; Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); SDValue InFlag = Chain.getValue(1); - unsigned paramCount = 0; + unsigned ParamCount = 0; // Args.size() and Outs.size() need not match. // Outs.size() will be larger // * if there is an aggregate argument with multiple fields (each field @@ -1456,173 +1469,155 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { EVT VT = Outs[OIdx].VT; Type *Ty = Args[i].Ty; + bool IsByVal = Outs[OIdx].Flags.isByVal(); - if (!Outs[OIdx].Flags.isByVal()) { - SmallVector<EVT, 16> VTs; - SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); - Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL); - unsigned AllocSize = DL.getTypeAllocSize(Ty); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - bool NeedAlign; // Does argument declaration specify alignment? - if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - NeedAlign = true; - } else { - // declare .param .b<size> .param<n>; - if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - AllocSize = 4; - } - SDValue DeclareScalarParamOps[] = { - Chain, DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize * 8, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareScalarParamOps); - NeedAlign = false; - } - InFlag = Chain.getValue(1); + SmallVector<EVT, 16> VTs; + SmallVector<uint64_t, 16> Offsets; - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter - // than 32-bits are sign extended or zero extended, depending on - // whether they are signed or unsigned types. 
This case applies - // only to scalar parameters and not to aggregate values. - bool ExtendIntegerParam = - Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; - - auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); - SmallVector<SDValue, 6> StoreOperands; - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); - } + assert((!IsByVal || Args[i].IndirectType) && + "byval arg must have indirect type"); + Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); + ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets); + + Align ArgAlign; + if (IsByVal) { + // The ByValAlign in the Outs[OIdx].Flags is always set at this point, + // so we don't need to worry whether it's naturally aligned or not. + // See TargetLowering::LowerCallTo(). + ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment to enhance vectorization options. + ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign( + CB->getCalledFunction(), ETy, DL)); + + // Enforce minumum alignment of 4 to work around ptxas miscompile + // for sm_50+. See corresponding alignment adjustment in + // emitFunctionParamList() for details. + ArgAlign = std::max(ArgAlign, Align(4)); + } else { + ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); + } - EVT EltVT = VTs[j]; - SDValue StVal = OutVals[OIdx]; - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, - dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } + unsigned TypeSize = + (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - // Record the value to store. - StoreOperands.push_back(StVal); - - if (VectorInfo[j] & PVF_LAST) { - unsigned NumElts = StoreOperands.size() - 3; - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + bool NeedAlign; // Does argument declaration specify alignment? + if (IsByVal || + (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128))) { + // declare .param .align <align> .b8 .param<n>[<size>]; + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), + DAG.getConstant(ParamCount, dl, MVT::i32), + DAG.getConstant(TypeSize, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps); + NeedAlign = true; + } else { + // declare .param .b<size> .param<n>; + if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) { + // PTX ABI requires integral types to be at least 32 bits in + // size. FP16 is loaded/stored using i16, so it's handled + // here as well. 
+ TypeSize = 4; + } + SDValue DeclareScalarParamOps[] = { + Chain, DAG.getConstant(ParamCount, dl, MVT::i32), + DAG.getConstant(TypeSize * 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareScalarParamOps); + NeedAlign = false; + } + InFlag = Chain.getValue(1); - StoreOperands.push_back(InFlag); + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + bool ExtendIntegerParam = + Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; - MaybeAlign EltAlign; - if (NeedAlign) - EltAlign = commonAlignment(ArgAlign, Offsets[j]); + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector<SDValue, 6> StoreOperands; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + EVT EltVT = VTs[j]; + int CurOffset = Offsets[j]; + MaybeAlign PartAlign; + if (NeedAlign) + PartAlign = commonAlignment(ArgAlign, CurOffset); + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(ParamCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(CurOffset, dl, MVT::i32)); + } - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), EltAlign, - MachineMemOperand::MOStore); - InFlag = Chain.getValue(1); + SDValue StVal = OutVals[OIdx]; + if (IsByVal) { + auto PtrVT = getPointerTy(DL); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, + DAG.getConstant(CurOffset, dl, PtrVT)); + StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), + PartAlign); + } else if (ExtendIntegerParam) { + assert(VTs.size() == 1 && "Scalar can't have multiple parts."); + // zext/sext to i32 + StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, StVal); + } - // Cleanup. - StoreOperands.clear(); - } - ++OIdx; + if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } - assert(StoreOperands.empty() && "Unfinished parameter store."); - if (VTs.size() > 0) - --OIdx; - ++paramCount; - continue; - } - // ByVal arguments - SmallVector<EVT, 16> VTs; - SmallVector<uint64_t, 16> Offsets; - auto *PTy = dyn_cast<PointerType>(Args[i].Ty); - assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets, - 0); + // Record the value to store. + StoreOperands.push_back(StVal); - // declare .param .align <align> .b8 .param<n>[<size>]; - unsigned sz = Outs[OIdx].Flags.getByValSize(); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); - // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, - // so we don't need to worry about natural alignment or not. - // See TargetLowering::LowerCallTo(). 
- - // Enforce minumum alignment of 4 to work around ptxas miscompile - // for sm_50+. See corresponding alignment adjustment in - // emitFunctionParamList() for details. - if (ArgAlign < Align(4)) - ArgAlign = Align(4); - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - EVT elemtype = VTs[j]; - int curOffset = Offsets[j]; - unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); - auto PtrVT = getPointerTy(DL); - SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], - DAG.getConstant(curOffset, dl, PtrVT)); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), PartAlign); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(curOffset, dl, MVT::i32), - theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } - InFlag = Chain.getValue(1); + StoreOperands.push_back(InFlag); + + // Adjust type of the store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, + TheStoreType, MachinePointerInfo(), PartAlign, + MachineMemOperand::MOStore); + InFlag = Chain.getValue(1); + + // Cleanup. 
+ StoreOperands.clear(); + } + if (!IsByVal) + ++OIdx; } - ++paramCount; + assert(StoreOperands.empty() && "Unfinished parameter store."); + if (!IsByVal && VTs.size() > 0) + --OIdx; + ++ParamCount; } GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); @@ -1729,7 +1724,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallArgBeginOps); InFlag = Chain.getValue(1); - for (unsigned i = 0, e = paramCount; i != e; ++i) { + for (unsigned i = 0, e = ParamCount; i != e; ++i) { unsigned opcode; if (i == (e - 1)) opcode = NVPTXISD::LastCallArg; @@ -1865,7 +1860,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = Ret.getValue(1); InFlag = Ret.getValue(2); - if (ProxyRegTruncates[i].hasValue()) { + if (ProxyRegTruncates[i]) { Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); } @@ -2249,7 +2244,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->getAlignment(), + LD->getPointerInfo(), LD->getAlign(), LD->getMemOperand()->getFlags()); SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); // The legalizer (the caller) is expecting two values from the legalized @@ -2414,7 +2409,7 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, - ST->getAlignment(), ST->getMemOperand()->getFlags()); + ST->getAlign(), ST->getMemOperand()->getFlags()); return Result; } @@ -2431,29 +2426,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); } -// Check to see if the kernel argument is image*_t or sampler_t - -static bool isImageOrSamplerVal(const Value *arg, const Module *context) { - static const char *const specialTypes[] = { "struct._image2d_t", - "struct._image3d_t", - "struct._sampler_t" }; - - Type *Ty = arg->getType(); - auto *PTy = dyn_cast<PointerType>(Ty); - - if (!PTy) - return false; - - if (!context) - return false; - - auto *STy = dyn_cast<StructType>(PTy->getPointerElementType()); - if (!STy || STy->isLiteral()) - return false; - - return llvm::is_contained(specialTypes, STy->getName()); -} - SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -2495,19 +2467,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { Type *Ty = argTypes[i]; - // If the kernel argument is image*_t or sampler_t, convert it to - // a i32 constant holding the parameter position. This can later - // matched in the AsmPrinter to output the correct mangled name. - if (isImageOrSamplerVal( - theArgs[i], - (theArgs[i]->getParent() ? 
theArgs[i]->getParent()->getParent() - : nullptr))) { - assert(isKernelFunction(*F) && - "Only kernels can have image/sampler params"); - InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); - continue; - } - if (theArgs[i]->use_empty()) { // argument is dead if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { @@ -2658,7 +2617,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); + const MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); Type *RetTy = MF.getFunction().getReturnType(); bool isABI = (STI.getSmVersion() >= 20); @@ -2673,7 +2633,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); + VTs, Offsets, + RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) + : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -4293,6 +4255,26 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return false; } +/// getFunctionParamOptimizedAlign - since function arguments are passed via +/// .param space, we may want to increase their alignment in a way that +/// ensures that we can effectively vectorize their loads & stores. We can +/// increase alignment only if the function has internal or has private +/// linkage as for other linkage types callers may already rely on default +/// alignment. To allow using 128-bit vectorized loads/stores, this function +/// ensures that alignment is 16 or greater. +Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( + const Function *F, Type *ArgTy, const DataLayout &DL) const { + const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); + + // If a function has linkage different from internal or private, we + // must use default ABI alignment as external users rely on it. + if (!F->hasLocalLinkage()) + return Align(ABITypeAlign); + + assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); + return Align(std::max(uint64_t(16), ABITypeAlign)); +} + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// Used to guide target specific optimizations, like loop strength reduction @@ -4516,6 +4498,17 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } +static SDValue PerformStoreRetvalCombine(SDNode *N) { + // Operands from the 2nd to the last one are the values to be stored + for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) + if (!N->getOperand(I).isUndef()) + return SDValue(); + + // Operand 0 is the previous value in the chain. Cannot return EntryToken + // as the previous value will become unused and eliminated later. + return N->getOperand(0); +} + /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
/// static SDValue PerformADDCombine(SDNode *N, @@ -4844,6 +4837,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformREMCombine(N, DCI, OptLevel); case ISD::SETCC: return PerformSETCCCombine(N, DCI); + case NVPTXISD::StoreRetval: + case NVPTXISD::StoreRetvalV2: + case NVPTXISD::StoreRetvalV4: + return PerformStoreRetvalCombine(N); } return SDValue(); } @@ -5130,8 +5127,69 @@ void NVPTXTargetLowering::ReplaceNodeResults( } } +NVPTXTargetLowering::AtomicExpansionKind +NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + Type *Ty = AI->getValOperand()->getType(); + + if (AI->isFloatingPointOperation()) { + if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { + if (Ty->isFloatTy()) + return AtomicExpansionKind::None; + if (Ty->isDoubleTy() && STI.hasAtomAddF64()) + return AtomicExpansionKind::None; + } + return AtomicExpansionKind::CmpXChg; + } + + assert(Ty->isIntegerTy() && "Ty should be integer at this point"); + auto ITy = cast<llvm::IntegerType>(Ty); + + switch (AI->getOperation()) { + default: + return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::BinOp::And: + case AtomicRMWInst::BinOp::Or: + case AtomicRMWInst::BinOp::Xor: + case AtomicRMWInst::BinOp::Xchg: + switch (ITy->getBitWidth()) { + case 8: + case 16: + return AtomicExpansionKind::CmpXChg; + case 32: + return AtomicExpansionKind::None; + case 64: + if (STI.hasAtomBitwise64()) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + default: + llvm_unreachable("unsupported width encountered"); + } + case AtomicRMWInst::BinOp::Add: + case AtomicRMWInst::BinOp::Sub: + case AtomicRMWInst::BinOp::Max: + case AtomicRMWInst::BinOp::Min: + case AtomicRMWInst::BinOp::UMax: + case AtomicRMWInst::BinOp::UMin: + switch (ITy->getBitWidth()) { + case 8: + case 16: + return AtomicExpansionKind::CmpXChg; + case 32: + return AtomicExpansionKind::None; + case 64: + if (STI.hasAtomMinMax64()) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + default: + llvm_unreachable("unsupported width encountered"); + } + } + + return AtomicExpansionKind::CmpXChg; +} + // Pin NVPTXTargetObjectFile's vtables to this file. -NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} +NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { |
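Several of the hunks above (getPrototype, LowerCall, LowerReturn) funnel through the new getFunctionParamOptimizedAlign helper. Its rule is small enough to restate as a standalone sketch; the ParamInfo struct below is a hypothetical stand-in for the Function and DataLayout queries the real helper makes:

```cpp
// Standalone sketch of the alignment rule introduced by
// getFunctionParamOptimizedAlign: parameters of functions with internal or
// private linkage may be over-aligned to 16 bytes so .param loads and stores
// can be vectorized as 128-bit accesses, while externally visible functions
// keep the ABI alignment their callers already rely on.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct ParamInfo {
  uint64_t ABITypeAlign; // stand-in for DL.getABITypeAlign(ArgTy)
  bool HasLocalLinkage;  // stand-in for F->hasLocalLinkage()
};

static uint64_t functionParamOptimizedAlign(const ParamInfo &P) {
  // Externally visible functions must keep the default ABI alignment.
  if (!P.HasLocalLinkage)
    return P.ABITypeAlign;
  // Local functions: raise to at least 16 so ld.param/st.param can use
  // 128-bit vectorized accesses.
  return std::max<uint64_t>(16, P.ABITypeAlign);
}

int main() {
  printf("%llu\n", (unsigned long long)functionParamOptimizedAlign({4, true}));  // 16
  printf("%llu\n", (unsigned long long)functionParamOptimizedAlign({4, false})); // 4
  return 0;
}
```

The linkage check is the load-bearing part: over-aligning the .param declaration of an externally visible function would change the calling convention observed by callers compiled against the default ABI alignment, which is why only internal/private functions get the 16-byte bump.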