Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 250 |
1 file changed, 250 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
new file mode 100644
index 000000000000..c34c12ab9fec
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -0,0 +1,250 @@
+//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass replaces accesses to kernel arguments with loads from
+/// offsets from the kernarg base pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerKernelArguments : public FunctionPass{
+public:
+  static char ID;
+
+  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesAll();
+  }
+};
+
+} // end anonymous namespace
+
+// skip allocas
+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+    // so loads will need to be inserted before it.
+    if (!AI || !AI->isStaticAlloca())
+      break;
+  }
+
+  return InsPt;
+}
+
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+    return false;
+
+  auto &TPC = getAnalysis<TargetPassConfig>();
+
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  LLVMContext &Ctx = F.getParent()->getContext();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  BasicBlock &EntryBlock = *F.begin();
+  IRBuilder<> Builder(&*getInsertPt(EntryBlock));
+
+  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
+  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+
+  Align MaxAlign;
+  // FIXME: Alignment is broken with explicit arg offset.
+  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+  if (TotalKernArgSize == 0)
+    return false;
+
+  CallInst *KernArgSegment =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
+                              nullptr, F.getName() + ".kernarg.segment");
+
+  KernArgSegment->addRetAttr(Attribute::NonNull);
+  KernArgSegment->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
+
+  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+  uint64_t ExplicitArgOffset = 0;
+
+  for (Argument &Arg : F.args()) {
+    const bool IsByRef = Arg.hasByRefAttr();
+    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+    MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None;
+    if (!ABITypeAlign)
+      ABITypeAlign = DL.getABITypeAlign(ArgTy);
+
+    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
+    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+
+    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+
+    if (Arg.use_empty())
+      continue;
+
+    // If this is byval, the loads are already explicit in the function. We just
+    // need to rewrite the pointer values.
+    if (IsByRef) {
+      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
+          Builder.getInt8Ty(), KernArgSegment, EltOffset,
+          Arg.getName() + ".byval.kernarg.offset");
+
+      Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          ArgOffsetPtr, Arg.getType());
+      Arg.replaceAllUsesWith(CastOffsetPtr);
+      continue;
+    }
+
+    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
+      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
+      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
+      // can't represent this with range metadata because it's only allowed for
+      // integer types.
+      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
+          !ST.hasUsableDSOffset())
+        continue;
+
+      // FIXME: We can replace this with equivalent alias.scope/noalias
+      // metadata, but this appears to be a lot of work.
+      if (Arg.hasNoAliasAttr())
+        continue;
+    }
+
+    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
+    bool IsV3 = VT && VT->getNumElements() == 3;
+    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
+    VectorType *V4Ty = nullptr;
+
+    int64_t AlignDownOffset = alignDown(EltOffset, 4);
+    int64_t OffsetDiff = EltOffset - AlignDownOffset;
+    Align AdjustedAlign = commonAlignment(
+        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
+
+    Value *ArgPtr;
+    Type *AdjustedArgTy;
+    if (DoShiftOpt) { // FIXME: Handle aggregate types
+      // Since we don't have sub-dword scalar loads, avoid doing an extload by
+      // loading earlier than the argument address, and extracting the relevant
+      // bits.
+      //
+      // Additionally widen any sub-dword load to i32 even if suitably aligned,
+      // so that CSE between different argument loads works easily.
+      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
+          Arg.getName() + ".kernarg.offset.align.down");
+      AdjustedArgTy = Builder.getInt32Ty();
+    } else {
+      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
+          Builder.getInt8Ty(), KernArgSegment, EltOffset,
+          Arg.getName() + ".kernarg.offset");
+      AdjustedArgTy = ArgTy;
+    }
+
+    if (IsV3 && Size >= 32) {
+      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
+      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
+      AdjustedArgTy = V4Ty;
+    }
+
+    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
+                                   ArgPtr->getName() + ".cast");
+    LoadInst *Load =
+        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
+    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
+
+    MDBuilder MDB(Ctx);
+
+    if (isa<PointerType>(ArgTy)) {
+      if (Arg.hasNonNullAttr())
+        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
+
+      uint64_t DerefBytes = Arg.getDereferenceableBytes();
+      if (DerefBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable,
+          MDNode::get(Ctx,
+                      MDB.createConstant(
+                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
+      }
+
+      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
+      if (DerefOrNullBytes != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_dereferenceable_or_null,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          DerefOrNullBytes))));
+      }
+
+      unsigned ParamAlign = Arg.getParamAlignment();
+      if (ParamAlign != 0) {
+        Load->setMetadata(
+          LLVMContext::MD_align,
+          MDNode::get(Ctx,
+                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
+                                                          ParamAlign))));
+      }
+    }
+
+    // TODO: Convert noalias arg to !noalias
+
+    if (DoShiftOpt) {
+      Value *ExtractBits = OffsetDiff == 0 ?
+        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
+
+      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
+      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
+      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
+                                            Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(NewVal);
+    } else if (IsV3) {
+      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
+                                                Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Shuf);
+    } else {
+      Load->setName(Arg.getName() + ".load");
+      Arg.replaceAllUsesWith(Load);
+    }
+  }
+
+  KernArgSegment->addRetAttr(
+      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+
+  return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
+                      "AMDGPU Lower Kernel Arguments", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
+                    false, false)
+
+char AMDGPULowerKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
+  return new AMDGPULowerKernelArguments();
+}
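The DoShiftOpt path in the patch is easy to misread, so here is a minimal sketch of the arithmetic it emits, restated as plain C++. The function name and parameters are made up for illustration and are not part of the patch; the shift-by-byte-offset trick is correct because AMDGPU is little-endian. For a sub-dword argument (say an i16) at byte EltOffset in the kernarg segment, the pass loads the aligned dword that contains it, shifts the wanted bytes down, and truncates, instead of emitting a sub-dword extload.

// Sketch only: what the pass's CreateLShr + CreateTrunc sequence computes
// for a hypothetical i16 kernel argument at byte EltOffset.
#include <cstdint>

uint16_t extractSubDwordArg(uint32_t AlignedDwordLoad, uint64_t EltOffset) {
  uint64_t AlignDownOffset = EltOffset & ~uint64_t(3); // alignDown(EltOffset, 4)
  uint64_t OffsetDiff = EltOffset - AlignDownOffset;   // byte position within the dword
  // Shift the argument's bytes down to bit 0, then truncate to the argument width.
  return static_cast<uint16_t>(AlignedDwordLoad >> (OffsetDiff * 8));
}

Loading the widened i32 even when the argument is already dword-aligned (OffsetDiff == 0) keeps all sub-dword argument loads the same type, so CSE can merge loads of neighbouring small arguments.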
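For context on the factory function at the end of the patch: the sketch below shows one way the pass could be driven standalone through the legacy pass manager. It is not part of the patch; the wrapper function and the assumption that the caller already has an AMDGPU LLVMTargetMachine ("TM") and a Module ("M") targeting amdgcn are illustrative only, and the forward declaration mirrors the one in the in-tree AMDGPU.h header.

// Sketch only: schedule AMDGPULowerKernelArguments with the legacy pass manager.
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

namespace llvm {
FunctionPass *createAMDGPULowerKernelArgumentsPass(); // declared in AMDGPU.h in this patch
} // namespace llvm

void lowerKernelArguments(llvm::LLVMTargetMachine &TM, llvm::Module &M) {
  llvm::legacy::PassManager PM;
  // The pass declares addRequired<TargetPassConfig>(); the pass config is what
  // hands it the GCNSubtarget queries for kernarg segment size and the explicit
  // kernarg offset, so it has to be registered before the pass itself.
  PM.add(TM.createPassConfig(PM));
  PM.add(llvm::createAMDGPULowerKernelArgumentsPass());
  PM.run(M);
}

In-tree the pass is scheduled by the AMDGPU target machine during IR-level codegen preparation rather than by hand, so a wrapper like this is only useful for experimentation or testing.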
