| field | value |
|---|---|
| author | Dimitry Andric <dim@FreeBSD.org>, 2023-09-02 21:17:18 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org>, 2023-12-08 17:34:50 +0000 |
| commit | 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e (patch) |
| tree | 62f873df87c7c675557a179e0c4c83fe9f3087bc /contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp |
| parent | cf037972ea8863e2bab7461d77345367d2c1e054 (diff) |
| parent | 7fa27ce4a07f19b07799a767fc29416f3b625afb (diff) |
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp | 601 |

1 file changed, 601 insertions, 0 deletions
```diff
diff --git a/contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp b/contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp
new file mode 100644
index 000000000000..796a2be81a09
--- /dev/null
+++ b/contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -0,0 +1,601 @@
+//===- AMDGPU.cpp ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ABIInfoImpl.h"
+#include "TargetInfo.h"
+
+using namespace clang;
+using namespace clang::CodeGen;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPUABIInfo final : public DefaultABIInfo {
+private:
+  static const unsigned MaxNumRegsForArgsRet = 16;
+
+  unsigned numRegsForType(QualType Ty) const;
+
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
+  bool isHomogeneousAggregateSmallEnough(const Type *Base,
+                                         uint64_t Members) const override;
+
+  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
+  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
+                                       unsigned ToAS) const {
+    // Single value types.
+    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
+    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
+      return llvm::PointerType::get(Ty->getContext(), ToAS);
+    return Ty;
+  }
+
+public:
+  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
+    DefaultABIInfo(CGT) {}
+
+  ABIArgInfo classifyReturnType(QualType RetTy) const;
+  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
+  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;
+
+  void computeInfo(CGFunctionInfo &FI) const override;
+  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                    QualType Ty) const override;
+};
+
+bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
+  return true;
+}
+
+bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
+  const Type *Base, uint64_t Members) const {
+  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
+
+  // Homogeneous Aggregates may occupy at most 16 registers.
+  return Members * NumRegs <= MaxNumRegsForArgsRet;
+}
+
+/// Estimate number of registers the type will use when passed in registers.
+unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
+  unsigned NumRegs = 0;
+
+  if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // Compute from the number of elements. The reported size is based on the
+    // in-memory size, which includes the padding 4th element for 3-vectors.
+    QualType EltTy = VT->getElementType();
+    unsigned EltSize = getContext().getTypeSize(EltTy);
+
+    // 16-bit element vectors should be passed as packed.
+    if (EltSize == 16)
+      return (VT->getNumElements() + 1) / 2;
+
+    unsigned EltNumRegs = (EltSize + 31) / 32;
+    return EltNumRegs * VT->getNumElements();
+  }
+
+  if (const RecordType *RT = Ty->getAs<RecordType>()) {
+    const RecordDecl *RD = RT->getDecl();
+    assert(!RD->hasFlexibleArrayMember());
+
+    for (const FieldDecl *Field : RD->fields()) {
+      QualType FieldTy = Field->getType();
+      NumRegs += numRegsForType(FieldTy);
+    }
+
+    return NumRegs;
+  }
+
+  return (getContext().getTypeSize(Ty) + 31) / 32;
+}
+
+void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  llvm::CallingConv::ID CC = FI.getCallingConvention();
+
+  if (!getCXXABI().classifyReturnType(FI))
+    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
+
+  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
+  for (auto &Arg : FI.arguments()) {
+    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
+      Arg.info = classifyKernelArgumentType(Arg.type);
+    } else {
+      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
+    }
+  }
+}
+
+Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                                 QualType Ty) const {
+  llvm_unreachable("AMDGPU does not support varargs");
+}
+
+ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
+  if (isAggregateTypeForABI(RetTy)) {
+    // Records with non-trivial destructors/copy-constructors should not be
+    // returned by value.
+    if (!getRecordArgABI(RetTy, getCXXABI())) {
+      // Ignore empty structs/unions.
+      if (isEmptyRecord(getContext(), RetTy, true))
+        return ABIArgInfo::getIgnore();
+
+      // Lower single-element structs to just return a regular value.
+      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
+        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+
+      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
+        const RecordDecl *RD = RT->getDecl();
+        if (RD->hasFlexibleArrayMember())
+          return DefaultABIInfo::classifyReturnType(RetTy);
+      }
+
+      // Pack aggregates <= 4 bytes into single VGPR or pair.
+      uint64_t Size = getContext().getTypeSize(RetTy);
+      if (Size <= 16)
+        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
+
+      if (Size <= 32)
+        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
+
+      if (Size <= 64) {
+        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
+        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
+      }
+
+      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
+        return ABIArgInfo::getDirect();
+    }
+  }
+
+  // Otherwise just do the default thing.
+  return DefaultABIInfo::classifyReturnType(RetTy);
+}
+
+/// For kernels all parameters are really passed in a special buffer. It doesn't
+/// make sense to pass anything byval, so everything must be direct.
+ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  // TODO: Can we omit empty structs?
+
+  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
+    Ty = QualType(SeltTy, 0);
+
+  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
+  llvm::Type *LTy = OrigLTy;
+  if (getContext().getLangOpts().HIP) {
+    LTy = coerceKernelArgumentType(
+        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
+        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
+  }
+
+  // FIXME: Should also use this for OpenCL, but it requires addressing the
+  // problem of kernels being called.
+  //
+  // FIXME: This doesn't apply the optimization of coercing pointers in structs
+  // to global address space when using byref. This would require implementing a
+  // new kind of coercion of the in-memory type when for indirect arguments.
+  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
+      isAggregateTypeForABI(Ty)) {
+    return ABIArgInfo::getIndirectAliased(
+        getContext().getTypeAlignInChars(Ty),
+        getContext().getTargetAddressSpace(LangAS::opencl_constant),
+        false /*Realign*/, nullptr /*Padding*/);
+  }
+
+  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
+  // individual elements, which confuses the Clover OpenCL backend; therefore we
+  // have to set it to false here. Other args of getDirect() are just defaults.
+  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
+}
+
+ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
+                                               unsigned &NumRegsLeft) const {
+  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
+
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  if (isAggregateTypeForABI(Ty)) {
+    // Records with non-trivial destructors/copy-constructors should not be
+    // passed by value.
+    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
+      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
+
+    // Ignore empty structs/unions.
+    if (isEmptyRecord(getContext(), Ty, true))
+      return ABIArgInfo::getIgnore();
+
+    // Lower single-element structs to just pass a regular value. TODO: We
+    // could do reasonable-size multiple-element structs too, using getExpand(),
+    // though watch out for things like bitfields.
+    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
+      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+
+    if (const RecordType *RT = Ty->getAs<RecordType>()) {
+      const RecordDecl *RD = RT->getDecl();
+      if (RD->hasFlexibleArrayMember())
+        return DefaultABIInfo::classifyArgumentType(Ty);
+    }
+
+    // Pack aggregates <= 8 bytes into single VGPR or pair.
+    uint64_t Size = getContext().getTypeSize(Ty);
+    if (Size <= 64) {
+      unsigned NumRegs = (Size + 31) / 32;
+      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
+
+      if (Size <= 16)
+        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
+
+      if (Size <= 32)
+        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
+
+      // XXX: Should this be i64 instead, and should the limit increase?
+      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
+      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
+    }
+
+    if (NumRegsLeft > 0) {
+      unsigned NumRegs = numRegsForType(Ty);
+      if (NumRegsLeft >= NumRegs) {
+        NumRegsLeft -= NumRegs;
+        return ABIArgInfo::getDirect();
+      }
+    }
+  }
+
+  // Otherwise just do the default thing.
+  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
+  if (!ArgInfo.isIndirect()) {
+    unsigned NumRegs = numRegsForType(Ty);
+    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
+  }
+
+  return ArgInfo;
+}
+
+class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
+      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
+
+  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
+                                 CodeGenModule &CGM) const;
+
+  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
+  unsigned getOpenCLKernelCallingConv() const override;
+
+  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
+      llvm::PointerType *T, QualType QT) const override;
+
+  LangAS getASTAllocaAddressSpace() const override {
+    return getLangASFromTargetAS(
+        getABIInfo().getDataLayout().getAllocaAddrSpace());
+  }
+  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
+                                  const VarDecl *D) const override;
+  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
+                                         SyncScope Scope,
+                                         llvm::AtomicOrdering Ordering,
+                                         llvm::LLVMContext &Ctx) const override;
+  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
+                                         llvm::Function *BlockInvokeFunc,
+                                         llvm::Type *BlockTy) const override;
+  bool shouldEmitStaticExternCAliases() const override;
+  bool shouldEmitDWARFBitFieldSeparators() const override;
+  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
+};
+}
+
+static bool requiresAMDGPUProtectedVisibility(const Decl *D,
+                                              llvm::GlobalValue *GV) {
+  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
+    return false;
+
+  return D->hasAttr<OpenCLKernelAttr>() ||
+         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
+         (isa<VarDecl>(D) &&
+          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
+           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
+           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
+}
+
+void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
+    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+  const auto *ReqdWGS =
+      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
+  const bool IsOpenCLKernel =
+      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
+  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
+
+  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
+  if (ReqdWGS || FlatWGS) {
+    unsigned Min = 0;
+    unsigned Max = 0;
+    if (FlatWGS) {
+      Min = FlatWGS->getMin()
+                ->EvaluateKnownConstInt(M.getContext())
+                .getExtValue();
+      Max = FlatWGS->getMax()
+                ->EvaluateKnownConstInt(M.getContext())
+                .getExtValue();
+    }
+    if (ReqdWGS && Min == 0 && Max == 0)
+      Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
+
+    if (Min != 0) {
+      assert(Min <= Max && "Min must be less than or equal Max");
+
+      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
+    } else
+      assert(Max == 0 && "Max must be zero");
+  } else if (IsOpenCLKernel || IsHIPKernel) {
+    // By default, restrict the maximum size to a value specified by
+    // --gpu-max-threads-per-block=n or its default value for HIP.
+    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
+    const unsigned DefaultMaxWorkGroupSize =
+        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
+                       : M.getLangOpts().GPUMaxThreadsPerBlock;
+    std::string AttrVal =
+        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
+    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
+    unsigned Min =
+        Attr->getMin()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    unsigned Max = Attr->getMax() ? Attr->getMax()
+                                        ->EvaluateKnownConstInt(M.getContext())
+                                        .getExtValue()
+                                  : 0;
+
+    if (Min != 0) {
+      assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
+
+      std::string AttrVal = llvm::utostr(Min);
+      if (Max != 0)
+        AttrVal = AttrVal + "," + llvm::utostr(Max);
+      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
+    } else
+      assert(Max == 0 && "Max must be zero");
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
+    unsigned NumSGPR = Attr->getNumSGPR();
+
+    if (NumSGPR != 0)
+      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
+    uint32_t NumVGPR = Attr->getNumVGPR();
+
+    if (NumVGPR != 0)
+      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
+  }
+}
+
+void AMDGPUTargetCodeGenInfo::setTargetAttributes(
+    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+  if (requiresAMDGPUProtectedVisibility(D, GV)) {
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    GV->setDSOLocal(true);
+  }
+
+  if (GV->isDeclaration())
+    return;
+
+  llvm::Function *F = dyn_cast<llvm::Function>(GV);
+  if (!F)
+    return;
+
+  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
+  if (FD)
+    setFunctionDeclAttributes(FD, F, M);
+
+  const bool IsHIPKernel =
+      M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();
+
+  // TODO: This should be moved to language specific attributes instead.
+  if (IsHIPKernel)
+    F->addFnAttr("uniform-work-group-size", "true");
+
+  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
+    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
+
+  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
+    F->addFnAttr("amdgpu-ieee", "false");
+}
+
+unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
+  return llvm::CallingConv::AMDGPU_KERNEL;
+}
+
+// Currently LLVM assumes null pointers always have value 0,
+// which results in incorrectly transformed IR. Therefore, instead of
+// emitting null pointers in private and local address spaces, a null
+// pointer in generic address space is emitted which is casted to a
+// pointer in local or private address space.
+llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
+    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
+    QualType QT) const {
+  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
+    return llvm::ConstantPointerNull::get(PT);
+
+  auto &Ctx = CGM.getContext();
+  auto NPT = llvm::PointerType::get(
+      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
+  return llvm::ConstantExpr::getAddrSpaceCast(
+      llvm::ConstantPointerNull::get(NPT), PT);
+}
+
+LangAS
+AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
+                                                  const VarDecl *D) const {
+  assert(!CGM.getLangOpts().OpenCL &&
+         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
+         "Address space agnostic languages only");
+  LangAS DefaultGlobalAS = getLangASFromTargetAS(
+      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
+  if (!D)
+    return DefaultGlobalAS;
+
+  LangAS AddrSpace = D->getType().getAddressSpace();
+  assert(AddrSpace == LangAS::Default || isTargetAddressSpace(AddrSpace));
+  if (AddrSpace != LangAS::Default)
+    return AddrSpace;
+
+  // Only promote to address space 4 if VarDecl has constant initialization.
+  if (CGM.isTypeConstant(D->getType(), false, false) &&
+      D->hasConstantInitialization()) {
+    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
+      return *ConstAS;
+  }
+  return DefaultGlobalAS;
+}
+
+llvm::SyncScope::ID
+AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
+                                            SyncScope Scope,
+                                            llvm::AtomicOrdering Ordering,
+                                            llvm::LLVMContext &Ctx) const {
+  std::string Name;
+  switch (Scope) {
+  case SyncScope::HIPSingleThread:
+    Name = "singlethread";
+    break;
+  case SyncScope::HIPWavefront:
+  case SyncScope::OpenCLSubGroup:
+    Name = "wavefront";
+    break;
+  case SyncScope::HIPWorkgroup:
+  case SyncScope::OpenCLWorkGroup:
+    Name = "workgroup";
+    break;
+  case SyncScope::HIPAgent:
+  case SyncScope::OpenCLDevice:
+    Name = "agent";
+    break;
+  case SyncScope::HIPSystem:
+  case SyncScope::OpenCLAllSVMDevices:
+    Name = "";
+    break;
+  }
+
+  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
+    if (!Name.empty())
+      Name = Twine(Twine(Name) + Twine("-")).str();
+
+    Name = Twine(Twine(Name) + Twine("one-as")).str();
+  }
+
+  return Ctx.getOrInsertSyncScopeID(Name);
+}
+
+bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
+  return false;
+}
+
+bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
+  return true;
+}
+
+void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
+    const FunctionType *&FT) const {
+  FT = getABIInfo().getContext().adjustFunctionType(
+      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
+}
+
+/// Create an OpenCL kernel for an enqueued block.
+///
+/// The type of the first argument (the block literal) is the struct type
+/// of the block literal instead of a pointer type. The first argument
+/// (block literal) is passed directly by value to the kernel. The kernel
+/// allocates the same type of struct on stack and stores the block literal
+/// to it and passes its pointer to the block invoke function. The kernel
+/// has "enqueued-block" function attribute and kernel argument metadata.
+llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
+    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
+  auto &Builder = CGF.Builder;
+  auto &C = CGF.getLLVMContext();
+
+  auto *InvokeFT = Invoke->getFunctionType();
+  llvm::SmallVector<llvm::Type *, 2> ArgTys;
+  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
+  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
+  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
+  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
+  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
+  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
+
+  ArgTys.push_back(BlockTy);
+  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
+  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
+  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
+  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
+  AccessQuals.push_back(llvm::MDString::get(C, "none"));
+  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
+  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
+    ArgTys.push_back(InvokeFT->getParamType(I));
+    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
+    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
+    AccessQuals.push_back(llvm::MDString::get(C, "none"));
+    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
+    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
+    ArgNames.push_back(
+        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
+  }
+  std::string Name = Invoke->getName().str() + "_kernel";
+  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
+  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
+                                   &CGF.CGM.getModule());
+  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+
+  llvm::AttrBuilder KernelAttrs(C);
+  // FIXME: The invoke isn't applying the right attributes either
+  // FIXME: This is missing setTargetAttributes
+  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
+  KernelAttrs.addAttribute("enqueued-block");
+  F->addFnAttrs(KernelAttrs);
+
+  auto IP = CGF.Builder.saveIP();
+  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
+  Builder.SetInsertPoint(BB);
+  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
+  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
+  BlockPtr->setAlignment(BlockAlign);
+  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
+  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
+  llvm::SmallVector<llvm::Value *, 2> Args;
+  Args.push_back(Cast);
+  for (llvm::Argument &A : llvm::drop_begin(F->args()))
+    Args.push_back(&A);
+  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
+  call->setCallingConv(Invoke->getCallingConv());
+  Builder.CreateRetVoid();
+  Builder.restoreIP(IP);
+
+  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
+  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
+  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
+  F->setMetadata("kernel_arg_base_type",
+                 llvm::MDNode::get(C, ArgBaseTypeNames));
+  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
+  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
+    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
+
+  return F;
+}
+
+std::unique_ptr<TargetCodeGenInfo>
+CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
+  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
+}
```
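For context on how `AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes` (added above) surfaces at the source level, here is a minimal HIP-flavored sketch. It is not part of the commit: the kernel names, bounds values, and target are made up for illustration, and only the attribute spellings and the resulting IR attribute names come from the code above.

```cpp
// Illustrative only; assumes HIP device compilation with clang
// (e.g. -x hip --offload-arch=gfx90a). Kernel names are hypothetical.
#include <hip/hip_runtime.h>

// Carries AMDGPUFlatWorkGroupSizeAttr and AMDGPUWavesPerEUAttr, so
// setFunctionDeclAttributes() attaches the IR function attributes
//   "amdgpu-flat-work-group-size"="64,256" and "amdgpu-waves-per-eu"="2,4".
__global__ __attribute__((amdgpu_flat_work_group_size(64, 256),
                          amdgpu_waves_per_eu(2, 4)))
void tuned_kernel(float *out) {
  out[threadIdx.x] = 0.0f;
}

// No explicit bounds: the HIP-kernel fallback path in the code above applies
//   "amdgpu-flat-work-group-size"="1,<GPUMaxThreadsPerBlock>"
// where the maximum comes from --gpu-max-threads-per-block=n or its default.
__global__ void plain_kernel(float *out) {
  out[threadIdx.x] = 1.0f;
}
```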
