aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp')
-rw-r--r--contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp110
1 files changed, 62 insertions, 48 deletions
diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
index 5b43272bfa62..43dfbbb90dd5 100644
--- a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
@@ -71,8 +71,6 @@ private:
bool RelocatableDeviceCode;
/// Mangle context for device.
std::unique_ptr<MangleContext> DeviceMC;
- /// Some zeros used for GEPs.
- llvm::Constant *Zeros[2];
llvm::FunctionCallee getSetupArgumentFn() const;
llvm::FunctionCallee getLaunchFn() const;
@@ -91,9 +89,7 @@ private:
/// where the C code specifies const char*.
llvm::Constant *makeConstantString(const std::string &Str,
const std::string &Name = "") {
- auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
- return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
- ConstStr.getPointer(), Zeros);
+ return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer();
}
/// Helper function which generates an initialized constant array from Str,
@@ -117,7 +113,7 @@ private:
}
if (Alignment)
GV->setAlignment(llvm::Align(Alignment));
- return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
+ return GV;
}
/// Helper function that generates an empty dummy function returning void.
@@ -230,8 +226,6 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
IntTy = CGM.IntTy;
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
- Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
- Zeros[1] = Zeros[0];
PtrTy = CGM.UnqualPtrTy;
}
@@ -331,11 +325,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
// Store pointers to the arguments in a locally allocated launch_args.
for (unsigned i = 0; i < Args.size(); ++i) {
- llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+ llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
CGF.Builder.CreateDefaultAlignedStore(
- VoidVarPtr,
- CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i));
+ VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
+ PtrTy, KernelArgs.emitRawPointer(CGF), i));
}
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
@@ -361,7 +355,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
}
auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
- IdentifierInfo &cudaLaunchKernelII =
+ const IdentifierInfo &cudaLaunchKernelII =
CGM.getContext().Idents.get(LaunchKernelName);
FunctionDecl *cudaLaunchKernelFD = nullptr;
for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
@@ -393,9 +387,10 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
/*isVarArg=*/false),
addUnderscoredPrefixToName("PopCallConfiguration"));
- CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
- {GridDim.getPointer(), BlockDim.getPointer(),
- ShmemSize.getPointer(), Stream.getPointer()});
+ CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
+ BlockDim.emitRawPointer(CGF),
+ ShmemSize.emitRawPointer(CGF),
+ Stream.emitRawPointer(CGF)});
// Emit the call to cudaLaunch
llvm::Value *Kernel =
@@ -405,7 +400,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
cudaLaunchKernelFD->getParamDecl(0)->getType());
LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
- LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+ LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
cudaLaunchKernelFD->getParamDecl(3)->getType());
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
cudaLaunchKernelFD->getParamDecl(4)->getType());
@@ -423,6 +418,33 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
LaunchKernelArgs);
+
+ // To prevent CUDA device stub functions from being merged by ICF in MSVC
+ // environment, create an unique global variable for each kernel and write to
+ // the variable in the device stub.
+ if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+ !CGF.getLangOpts().HIP) {
+ llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
+ std::string GlobalVarName = (KernelFunction->getName() + ".id").str();
+
+ llvm::GlobalVariable *HandleVar =
+ CGM.getModule().getNamedGlobal(GlobalVarName);
+ if (!HandleVar) {
+ HandleVar = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty,
+ /*Constant=*/false, KernelFunction->getLinkage(),
+ llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
+ HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
+ HandleVar->setVisibility(KernelFunction->getVisibility());
+ if (KernelFunction->hasComdat())
+ HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
+ }
+
+ CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
+ HandleVar, CharUnits::One(),
+ /*IsVolatile=*/true);
+ }
+
CGF.EmitBranch(EndBlock);
CGF.EmitBlock(EndBlock);
@@ -438,8 +460,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
Offset = Offset.alignTo(TInfo.Align);
llvm::Value *Args[] = {
- CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
- PtrTy),
+ CGF.Builder.CreatePointerCast(
+ CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
};
@@ -491,7 +513,8 @@ static void replaceManagedVar(llvm::GlobalVariable *Var,
// variable with instructions.
for (auto &&Op : WorkItem) {
auto *CE = cast<llvm::ConstantExpr>(Op);
- auto *NewInst = CE->getAsInstruction(I);
+ auto *NewInst = CE->getAsInstruction();
+ NewInst->insertBefore(*I->getParent(), I->getIterator());
NewInst->replaceUsesOfWith(OldV, NewV);
OldV = CE;
NewV = NewInst;
@@ -604,20 +627,10 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
uint64_t VarSize =
CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
if (Info.Flags.isManaged()) {
- auto *ManagedVar = new llvm::GlobalVariable(
- CGM.getModule(), Var->getType(),
- /*isConstant=*/false, Var->getLinkage(),
- /*Init=*/Var->isDeclaration()
- ? nullptr
- : llvm::ConstantPointerNull::get(Var->getType()),
- /*Name=*/"", /*InsertBefore=*/nullptr,
- llvm::GlobalVariable::NotThreadLocal);
- ManagedVar->setDSOLocal(Var->isDSOLocal());
- ManagedVar->setVisibility(Var->getVisibility());
- ManagedVar->setExternallyInitialized(true);
- ManagedVar->takeName(Var);
- Var->setName(Twine(ManagedVar->getName() + ".managed"));
- replaceManagedVar(Var, ManagedVar);
+ assert(Var->getName().ends_with(".managed") &&
+ "HIP managed variables not transformed");
+ auto *ManagedVar = CGM.getModule().getNamedGlobal(
+ Var->getName().drop_back(StringRef(".managed").size()));
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
ManagedVar,
@@ -760,10 +773,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// to contain the fat binary but will be populated somewhere else,
// e.g. by lld through link script.
FatBinStr = new llvm::GlobalVariable(
- CGM.getModule(), CGM.Int8Ty,
- /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
- "__hip_fatbin", nullptr,
- llvm::GlobalVariable::NotThreadLocal);
+ CGM.getModule(), CGM.Int8Ty,
+ /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
+ "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
}
@@ -816,8 +829,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// thread safety of the loaded program. Therefore we can assume sequential
// execution of constructor functions here.
if (IsHIP) {
- auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
- llvm::GlobalValue::LinkOnceAnyLinkage;
+ auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
+ : llvm::GlobalValue::ExternalLinkage;
llvm::BasicBlock *IfBlock =
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
llvm::BasicBlock *ExitBlock =
@@ -826,11 +839,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
// of HIP ABI.
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, PtrTy, /*isConstant=*/false, Linkage,
- /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
- "__hip_gpubin_handle");
- if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
- GpuBinaryHandle->setComdat(
- CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
+ /*Initializer=*/
+ CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
+ CudaGpuBinary
+ ? "__hip_gpubin_handle"
+ : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
// Prevent the weak symbol in different shared libraries being merged.
if (Linkage != llvm::GlobalValue::InternalLinkage)
@@ -1092,7 +1105,9 @@ void CGNVCUDARuntime::transformManagedVars() {
: llvm::ConstantPointerNull::get(Var->getType()),
/*Name=*/"", /*InsertBefore=*/nullptr,
llvm::GlobalVariable::NotThreadLocal,
- CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
+ CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
+ ? LangAS::cuda_device
+ : LangAS::Default));
ManagedVar->setDSOLocal(Var->isDSOLocal());
ManagedVar->setVisibility(Var->getVisibility());
ManagedVar->setExternallyInitialized(true);
@@ -1101,7 +1116,7 @@ void CGNVCUDARuntime::transformManagedVars() {
Var->setName(Twine(ManagedVar->getName()) + ".managed");
// Keep managed variables even if they are not used in device code since
// they need to be allocated by the runtime.
- if (!Var->isDeclaration()) {
+ if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
assert(!ManagedVar->isDeclaration());
CGM.addCompilerUsedGlobal(Var);
CGM.addCompilerUsedGlobal(ManagedVar);
@@ -1159,9 +1174,8 @@ void CGNVCUDARuntime::createOffloadingEntries() {
// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
+ transformManagedVars();
if (CGM.getLangOpts().CUDAIsDevice) {
- transformManagedVars();
-
// Mark ODR-used device variables as compiler used to prevent it from being
// eliminated by optimization. This is necessary for device variables
// ODR-used by host functions. Sema correctly marks them as ODR-used no