diff options
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp')
-rw-r--r-- | contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp | 110 |
1 files changed, 62 insertions, 48 deletions
diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp index 5b43272bfa62..43dfbbb90dd5 100644 --- a/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp +++ b/contrib/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp @@ -71,8 +71,6 @@ private: bool RelocatableDeviceCode; /// Mangle context for device. std::unique_ptr<MangleContext> DeviceMC; - /// Some zeros used for GEPs. - llvm::Constant *Zeros[2]; llvm::FunctionCallee getSetupArgumentFn() const; llvm::FunctionCallee getLaunchFn() const; @@ -91,9 +89,7 @@ private: /// where the C code specifies const char*. llvm::Constant *makeConstantString(const std::string &Str, const std::string &Name = "") { - auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); - return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), - ConstStr.getPointer(), Zeros); + return CGM.GetAddrOfConstantCString(Str, Name.c_str()).getPointer(); } /// Helper function which generates an initialized constant array from Str, @@ -117,7 +113,7 @@ private: } if (Alignment) GV->setAlignment(llvm::Align(Alignment)); - return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros); + return GV; } /// Helper function that generates an empty dummy function returning void. @@ -230,8 +226,6 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) IntTy = CGM.IntTy; SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; - Zeros[0] = llvm::ConstantInt::get(SizeTy, 0); - Zeros[1] = Zeros[0]; PtrTy = CGM.UnqualPtrTy; } @@ -331,11 +325,11 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); // Store pointers to the arguments in a locally allocated launch_args. for (unsigned i = 0; i < Args.size(); ++i) { - llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); + llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF); llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy); CGF.Builder.CreateDefaultAlignedStore( - VoidVarPtr, - CGF.Builder.CreateConstGEP1_32(PtrTy, KernelArgs.getPointer(), i)); + VoidVarPtr, CGF.Builder.CreateConstGEP1_32( + PtrTy, KernelArgs.emitRawPointer(CGF), i)); } llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); @@ -361,7 +355,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, KernelLaunchAPI = KernelLaunchAPI + "_ptsz"; } auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); - IdentifierInfo &cudaLaunchKernelII = + const IdentifierInfo &cudaLaunchKernelII = CGM.getContext().Idents.get(LaunchKernelName); FunctionDecl *cudaLaunchKernelFD = nullptr; for (auto *Result : DC->lookup(&cudaLaunchKernelII)) { @@ -393,9 +387,10 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, /*isVarArg=*/false), addUnderscoredPrefixToName("PopCallConfiguration")); - CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, - {GridDim.getPointer(), BlockDim.getPointer(), - ShmemSize.getPointer(), Stream.getPointer()}); + CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF), + BlockDim.emitRawPointer(CGF), + ShmemSize.emitRawPointer(CGF), + Stream.emitRawPointer(CGF)}); // Emit the call to cudaLaunch llvm::Value *Kernel = @@ -405,7 +400,7 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, cudaLaunchKernelFD->getParamDecl(0)->getType()); LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); - LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), + LaunchKernelArgs.add(RValue::get(KernelArgs, CGF), cudaLaunchKernelFD->getParamDecl(3)->getType()); LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), cudaLaunchKernelFD->getParamDecl(4)->getType()); @@ -423,6 +418,33 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, CGM.CreateRuntimeFunction(FTy, LaunchKernelName); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); + + // To prevent CUDA device stub functions from being merged by ICF in MSVC + // environment, create an unique global variable for each kernel and write to + // the variable in the device stub. + if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && + !CGF.getLangOpts().HIP) { + llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel); + std::string GlobalVarName = (KernelFunction->getName() + ".id").str(); + + llvm::GlobalVariable *HandleVar = + CGM.getModule().getNamedGlobal(GlobalVarName); + if (!HandleVar) { + HandleVar = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, + /*Constant=*/false, KernelFunction->getLinkage(), + llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName); + HandleVar->setDSOLocal(KernelFunction->isDSOLocal()); + HandleVar->setVisibility(KernelFunction->getVisibility()); + if (KernelFunction->hasComdat()) + HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName)); + } + + CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1), + HandleVar, CharUnits::One(), + /*IsVolatile=*/true); + } + CGF.EmitBranch(EndBlock); CGF.EmitBlock(EndBlock); @@ -438,8 +460,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); Offset = Offset.alignTo(TInfo.Align); llvm::Value *Args[] = { - CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), - PtrTy), + CGF.Builder.CreatePointerCast( + CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy), llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), }; @@ -491,7 +513,8 @@ static void replaceManagedVar(llvm::GlobalVariable *Var, // variable with instructions. for (auto &&Op : WorkItem) { auto *CE = cast<llvm::ConstantExpr>(Op); - auto *NewInst = CE->getAsInstruction(I); + auto *NewInst = CE->getAsInstruction(); + NewInst->insertBefore(*I->getParent(), I->getIterator()); NewInst->replaceUsesOfWith(OldV, NewV); OldV = CE; NewV = NewInst; @@ -604,20 +627,10 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { uint64_t VarSize = CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); if (Info.Flags.isManaged()) { - auto *ManagedVar = new llvm::GlobalVariable( - CGM.getModule(), Var->getType(), - /*isConstant=*/false, Var->getLinkage(), - /*Init=*/Var->isDeclaration() - ? nullptr - : llvm::ConstantPointerNull::get(Var->getType()), - /*Name=*/"", /*InsertBefore=*/nullptr, - llvm::GlobalVariable::NotThreadLocal); - ManagedVar->setDSOLocal(Var->isDSOLocal()); - ManagedVar->setVisibility(Var->getVisibility()); - ManagedVar->setExternallyInitialized(true); - ManagedVar->takeName(Var); - Var->setName(Twine(ManagedVar->getName() + ".managed")); - replaceManagedVar(Var, ManagedVar); + assert(Var->getName().ends_with(".managed") && + "HIP managed variables not transformed"); + auto *ManagedVar = CGM.getModule().getNamedGlobal( + Var->getName().drop_back(StringRef(".managed").size())); llvm::Value *Args[] = { &GpuBinaryHandlePtr, ManagedVar, @@ -760,10 +773,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // to contain the fat binary but will be populated somewhere else, // e.g. by lld through link script. FatBinStr = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin", nullptr, - llvm::GlobalVariable::NotThreadLocal); + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, + llvm::GlobalVariable::NotThreadLocal); cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); } @@ -816,8 +829,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : - llvm::GlobalValue::LinkOnceAnyLinkage; + auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage + : llvm::GlobalValue::ExternalLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -826,11 +839,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, - /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), - "__hip_gpubin_handle"); - if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) - GpuBinaryHandle->setComdat( - CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); + /*Initializer=*/ + CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, + CudaGpuBinary + ? "__hip_gpubin_handle" + : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) @@ -1092,7 +1105,9 @@ void CGNVCUDARuntime::transformManagedVars() { : llvm::ConstantPointerNull::get(Var->getType()), /*Name=*/"", /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); + CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice + ? LangAS::cuda_device + : LangAS::Default)); ManagedVar->setDSOLocal(Var->isDSOLocal()); ManagedVar->setVisibility(Var->getVisibility()); ManagedVar->setExternallyInitialized(true); @@ -1101,7 +1116,7 @@ void CGNVCUDARuntime::transformManagedVars() { Var->setName(Twine(ManagedVar->getName()) + ".managed"); // Keep managed variables even if they are not used in device code since // they need to be allocated by the runtime. - if (!Var->isDeclaration()) { + if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) { assert(!ManagedVar->isDeclaration()); CGM.addCompilerUsedGlobal(Var); CGM.addCompilerUsedGlobal(ManagedVar); @@ -1159,9 +1174,8 @@ void CGNVCUDARuntime::createOffloadingEntries() { // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { + transformManagedVars(); if (CGM.getLangOpts().CUDAIsDevice) { - transformManagedVars(); - // Mark ODR-used device variables as compiler used to prevent it from being // eliminated by optimization. This is necessary for device variables // ODR-used by host functions. Sema correctly marks them as ODR-used no |