diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp | 88 |
1 files changed, 68 insertions, 20 deletions
diff --git a/contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp b/contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp index 045e19b189dc..6a04d4eea784 100644 --- a/contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp +++ b/contrib/llvm/tools/clang/lib/CodeGen/CGCUDANV.cpp @@ -38,6 +38,7 @@ private: llvm::Module &TheModule; /// Keeps track of kernel launch stubs emitted in this module llvm::SmallVector<llvm::Function *, 16> EmittedKernels; + llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; /// Keeps track of variables containing handles of GPU binaries. Populated by /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() @@ -47,7 +48,7 @@ private: llvm::Constant *getLaunchFn() const; /// Creates a function to register all kernel stubs generated in this module. - llvm::Function *makeRegisterKernelsFn(); + llvm::Function *makeRegisterGlobalsFn(); /// Helper function that generates a constant string and returns a pointer to /// the start of the string. The result of this function can be used anywhere @@ -68,6 +69,10 @@ public: CGNVCUDARuntime(CodeGenModule &CGM); void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; + void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { + DeviceVars.push_back(std::make_pair(&Var, Flags)); + } + /// Creates module constructor function llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function @@ -93,10 +98,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { // cudaError_t cudaSetupArgument(void *, size_t, size_t) - std::vector<llvm::Type*> Params; - Params.push_back(VoidPtrTy); - Params.push_back(SizeTy); - Params.push_back(SizeTy); + llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, Params, false), "cudaSetupArgument"); @@ -158,19 +160,28 @@ void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, CGF.EmitBlock(EndBlock); } -/// Creates internal function to register all kernel stubs generated in this -/// module with the CUDA runtime. +/// Creates a function that sets up state on the host side for CUDA objects that +/// have a presence on both the host and device sides. Specifically, registers +/// the host side of kernel functions and device global variables with the CUDA +/// runtime. /// \code -/// void __cuda_register_kernels(void** GpuBinaryHandle) { +/// void __cuda_register_globals(void** GpuBinaryHandle) { /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); /// ... /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); +/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); +/// ... +/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); /// } /// \endcode -llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { +llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { + // No need to register anything + if (EmittedKernels.empty() && DeviceVars.empty()) + return nullptr; + llvm::Function *RegisterKernelsFunc = llvm::Function::Create( llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), - llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule); + llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); llvm::BasicBlock *EntryBB = llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); CGBuilderTy Builder(CGM, Context); @@ -178,7 +189,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { // void __cudaRegisterFunction(void **, const char *, char *, const char *, // int, uint3*, uint3*, dim3*, dim3*, int*) - std::vector<llvm::Type *> RegisterFuncParams = { + llvm::Type *RegisterFuncParams[] = { VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( @@ -186,18 +197,44 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { "__cudaRegisterFunction"); // Extract GpuBinaryHandle passed as the first argument passed to - // __cuda_register_kernels() and generate __cudaRegisterFunction() call for + // __cuda_register_globals() and generate __cudaRegisterFunction() call for // each emitted kernel. llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); for (llvm::Function *Kernel : EmittedKernels) { llvm::Constant *KernelName = makeConstantString(Kernel->getName()); llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); - llvm::Value *args[] = { + llvm::Value *Args[] = { &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, NullPtr, NullPtr, NullPtr, llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; - Builder.CreateCall(RegisterFunc, args); + Builder.CreateCall(RegisterFunc, Args); + } + + // void __cudaRegisterVar(void **, char *, char *, const char *, + // int, int, int, int) + llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, + CharPtrTy, IntTy, IntTy, + IntTy, IntTy}; + llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, RegisterVarParams, false), + "__cudaRegisterVar"); + for (auto &Pair : DeviceVars) { + llvm::GlobalVariable *Var = Pair.first; + unsigned Flags = Pair.second; + llvm::Constant *VarName = makeConstantString(Var->getName()); + uint64_t VarSize = + CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); + llvm::Value *Args[] = { + &GpuBinaryHandlePtr, + Builder.CreateBitCast(Var, VoidPtrTy), + VarName, + VarName, + llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), + llvm::ConstantInt::get(IntTy, VarSize), + llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, Args); } Builder.CreateRetVoid(); @@ -208,15 +245,19 @@ llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { /// \code /// void __cuda_module_ctor(void*) { /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); -/// __cuda_register_kernels(Handle0); +/// __cuda_register_globals(Handle0); /// ... /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); -/// __cuda_register_kernels(HandleN); +/// __cuda_register_globals(HandleN); /// } /// \endcode llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { - // void __cuda_register_kernels(void* handle); - llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn(); + // No need to generate ctors/dtors if there are no GPU binaries. + if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) + return nullptr; + + // void __cuda_register_globals(void* handle); + llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); // void ** __cudaRegisterFatBinary(void *); llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), @@ -259,6 +300,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, llvm::ConstantStruct::get(FatbinWrapperTy, Values), "__cuda_fatbin_wrapper"); + // NVIDIA's cuobjdump looks for fatbins in this section. + FatbinWrapper->setSection(".nvFatBinSegment"); // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( @@ -270,8 +313,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, CGM.getPointerAlign()); - // Call __cuda_register_kernels(GpuBinaryHandle); - CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall); + // Call __cuda_register_globals(GpuBinaryHandle); + if (RegisterGlobalsFunc) + CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); // Save GpuBinaryHandle so we can unregister it in destructor. GpuBinaryHandles.push_back(GpuBinaryHandle); @@ -291,6 +335,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { /// } /// \endcode llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { + // No need for destructor if we don't have handles to unregister. + if (GpuBinaryHandles.empty()) + return nullptr; + // void __cudaUnregisterFatBinary(void ** handle); llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), |