diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:11:37 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:11:37 +0000 |
commit | 461a67fa15370a9ec88f8f8a240bf7c123bb2029 (patch) | |
tree | 6942083d7d56bba40ec790a453ca58ad3baf6832 /lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | |
parent | 75c3240472ba6ac2669ee72ca67eb72d4e2851fc (diff) |
Notes
Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 362 |
1 files changed, 304 insertions, 58 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 3ced05d08a47c..b5fc8d308067e 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -22,19 +22,21 @@ using namespace CodeGen; namespace { enum OpenMPRTLFunctionNVPTX { - /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); + /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit, + /// int16_t RequiresOMPRuntime); OMPRTL_NVPTX__kmpc_kernel_init, - /// \brief Call to void __kmpc_kernel_deinit(); + /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); OMPRTL_NVPTX__kmpc_kernel_deinit, /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - /// short RequiresOMPRuntime, short RequiresDataSharing); + /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); OMPRTL_NVPTX__kmpc_spmd_kernel_init, /// \brief Call to void __kmpc_spmd_kernel_deinit(); OMPRTL_NVPTX__kmpc_spmd_kernel_deinit, /// \brief Call to void __kmpc_kernel_prepare_parallel(void - /// *outlined_function); + /// *outlined_function, void ***args, kmp_int32 nArgs); OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, - /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function); + /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void + /// ***args); OMPRTL_NVPTX__kmpc_kernel_parallel, /// \brief Call to void __kmpc_kernel_end_parallel(); OMPRTL_NVPTX__kmpc_kernel_end_parallel, @@ -150,20 +152,18 @@ enum NamedBarrier : unsigned { /// Get the GPU warp size. static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateCall( + return CGF.EmitRuntimeCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), - llvm::None, "nvptx_warp_size"); + "nvptx_warp_size"); } /// Get the id of the current thread on the GPU. static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateCall( + return CGF.EmitRuntimeCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), - llvm::None, "nvptx_tid"); + "nvptx_tid"); } /// Get the id of the warp in the block. @@ -185,17 +185,15 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { /// Get the maximum number of threads in a block of the GPU. static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - return Bld.CreateCall( + return CGF.EmitRuntimeCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), - llvm::None, "nvptx_num_threads"); + "nvptx_num_threads"); } /// Get barrier to synchronize all threads in a block. static void getNVPTXCTABarrier(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - Bld.CreateCall(llvm::Intrinsic::getDeclaration( + CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); } @@ -205,9 +203,9 @@ static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads) { CGBuilderTy &Bld = CGF.Builder; llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; - Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(), - llvm::Intrinsic::nvvm_barrier), - Args); + CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier), + Args); } /// Synchronize all GPU threads in a block. @@ -280,6 +278,8 @@ getExecutionModeForDirective(CodeGenModule &CGM, case OMPD_target_teams: return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic; case OMPD_target_parallel: + case OMPD_target_parallel_for: + case OMPD_target_parallel_for_simd: return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd; default: llvm_unreachable("Unsupported directive on NVPTX device."); @@ -298,6 +298,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, EntryFunctionState EST; WorkerFunctionState WST(CGM); Work.clear(); + WrapperFunctionsMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { @@ -345,7 +346,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); CGF.EmitBlock(WorkerBB); - CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); + emitCall(CGF, WST.WorkerFn); CGF.EmitBranch(EST.ExitBB); CGF.EmitBlock(MasterCheckBB); @@ -356,7 +357,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, CGF.EmitBlock(MasterBB); // First action in sequential region: // Initialize the state of the OpenMP runtime library on the GPU. - llvm::Value *Args[] = {getThreadLimit(CGF)}; + // TODO: Optimize runtime initialization and pass in correct value. + llvm::Value *Args[] = {getThreadLimit(CGF), + Bld.getInt16(/*RequiresOMPRuntime=*/1)}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); } @@ -371,8 +374,10 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, CGF.EmitBlock(TerminateBB); // Signal termination condition. + // TODO: Optimize runtime initialization and pass in correct value. + llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args); // Barrier to terminate worker threads. syncCTAThreads(CGF); // Master thread jumps to exit point. @@ -413,7 +418,6 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D, CodeGen.setAction(Action); emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); - return; } void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader( @@ -471,7 +475,7 @@ static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { - auto &Ctx = CGM.getContext(); + ASTContext &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.disableDebugInfo(); @@ -514,7 +518,10 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - llvm::Value *Args[] = {WorkFn.getPointer()}; + // Set up shared arguments + Address SharedArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args"); + llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()}; llvm::Value *Ret = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); @@ -533,6 +540,9 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); + // Current context + ASTContext &Ctx = CGF.getContext(); + // Process work items: outlined parallel functions. for (auto *W : Work) { // Try to match this outlined function. @@ -548,14 +558,18 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, // Execute this outlined function. CGF.EmitBlock(ExecuteFNBB); - // Insert call to work function. - // FIXME: Pass arguments to outlined function from master thread. - auto *Fn = cast<llvm::Function>(W); - Address ZeroAddr = - CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0)); - llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()}; - CGF.EmitCallOrInvoke(Fn, FnArgs); + // Insert call to work function via shared wrapper. The shared + // wrapper takes exactly three arguments: + // - the parallelism level; + // - the master thread ID; + // - the list of references to shared arguments. + // + // TODO: Assert that the function is a wrapper function.s + Address Capture = CGF.EmitLoadOfPointer(SharedArgs, + Ctx.getPointerType( + Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()); + emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0), + getMasterThreadID(CGF), Capture.getPointer()}); // Go to end of parallel region. CGF.EmitBranch(TerminateBB); @@ -589,23 +603,25 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 thread_limit); - llvm::Type *TypeParams[] = {CGM.Int32Ty}; + // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t + // RequiresOMPRuntime); + llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); break; } case OMPRTL_NVPTX__kmpc_kernel_deinit: { - // Build void __kmpc_kernel_deinit(); + // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); + llvm::Type *TypeParams[] = {CGM.Int16Ty}; llvm::FunctionType *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); break; } case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - // short RequiresOMPRuntime, short RequiresDataSharing); + // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); @@ -621,16 +637,18 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { } case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { /// Build void __kmpc_kernel_prepare_parallel( - /// void *outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; + /// void *outlined_function, void ***args, kmp_int32 nArgs); + llvm::Type *TypeParams[] = {CGM.Int8PtrTy, + CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); break; } case OMPRTL_NVPTX__kmpc_kernel_parallel: { - /// Build bool __kmpc_kernel_parallel(void **outlined_function); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; + /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args); + llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, + CGM.Int8PtrPtrTy->getPointerTo(0)}; llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); llvm::FunctionType *FnTy = llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); @@ -849,8 +867,17 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { - return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar, - InnermostKind, CodeGen); + + auto *OutlinedFun = cast<llvm::Function>( + CGOpenMPRuntime::emitParallelOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen)); + if (!isInSpmdExecutionMode()) { + llvm::Function *WrapperFun = + createDataSharingWrapper(OutlinedFun, D); + WrapperFunctionsMap[OutlinedFun] = WrapperFun; + } + + return OutlinedFun; } llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( @@ -883,7 +910,7 @@ void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); + emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); } void CGOpenMPRuntimeNVPTX::emitParallelCall( @@ -902,15 +929,54 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); + llvm::Function *WFn = WrapperFunctionsMap[Fn]; + assert(WFn && "Wrapper function does not exist!"); + + // Force inline this outlined function at its call site. + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF, + PrePostActionTy &) { CGBuilderTy &Bld = CGF.Builder; - // Prepare for parallel region. Indicate the outlined function. - llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); + + if (!CapturedVars.empty()) { + // There's somehting to share, add the attribute + CGF.CurFn->addFnAttr("has-nvptx-shared-depot"); + // Prepare for parallel region. Indicate the outlined function. + Address SharedArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, + "shared_args"); + llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); + llvm::Value *Args[] = {ID, SharedArgsPtr, + Bld.getInt32(CapturedVars.size())}; + + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); + + unsigned Idx = 0; + ASTContext &Ctx = CGF.getContext(); + for (llvm::Value *V : CapturedVars) { + Address Dst = Bld.CreateConstInBoundsGEP( + CGF.EmitLoadOfPointer(SharedArgs, + Ctx.getPointerType( + Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()), + Idx, CGF.getPointerSize()); + llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy); + CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, + Ctx.getPointerType(Ctx.VoidPtrTy)); + Idx++; + } + } else { + llvm::Value *Args[] = {ID, + llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)), + /*nArgs=*/Bld.getInt32(0)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); + } // Activate workers. This barrier is used by the master to signal // work for the workers. @@ -925,17 +991,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( syncCTAThreads(CGF); // Remember for post-processing in worker loop. - Work.push_back(Fn); + Work.emplace_back(WFn); }; auto *RTLoc = emitUpdateLocation(CGF, Loc); auto *ThreadID = getThreadID(CGF, Loc); llvm::Value *Args[] = {RTLoc, ThreadID}; - auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF, - PrePostActionTy &) { - auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF, - PrePostActionTy &Action) { + auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF, + PrePostActionTy &) { + auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF, + PrePostActionTy &Action) { Action.Enter(CGF); llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; @@ -944,7 +1010,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( OutlinedFnArgs.push_back( llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs); + emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs); }; RegionCodeGenTy RCG(CodeGen); @@ -980,7 +1046,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall( OutlinedFnArgs.push_back( llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); + emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); } /// This function creates calls to one of two shuffle functions to copy @@ -2238,3 +2304,183 @@ void CGOpenMPRuntimeNVPTX::emitReduction( CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); } + +const VarDecl * +CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD, + const VarDecl *NativeParam) const { + if (!NativeParam->getType()->isReferenceType()) + return NativeParam; + QualType ArgType = NativeParam->getType(); + QualifierCollector QC; + const Type *NonQualTy = QC.strip(ArgType); + QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); + if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { + if (Attr->getCaptureKind() == OMPC_map) { + PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, + LangAS::opencl_global); + } + } + ArgType = CGM.getContext().getPointerType(PointeeTy); + QC.addRestrict(); + enum { NVPTX_local_addr = 5 }; + QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); + ArgType = QC.apply(CGM.getContext(), ArgType); + if (isa<ImplicitParamDecl>(NativeParam)) { + return ImplicitParamDecl::Create( + CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), + NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); + } + return ParmVarDecl::Create( + CGM.getContext(), + const_cast<DeclContext *>(NativeParam->getDeclContext()), + NativeParam->getLocStart(), NativeParam->getLocation(), + NativeParam->getIdentifier(), ArgType, + /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); +} + +Address +CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF, + const VarDecl *NativeParam, + const VarDecl *TargetParam) const { + assert(NativeParam != TargetParam && + NativeParam->getType()->isReferenceType() && + "Native arg must not be the same as target arg."); + Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); + QualType NativeParamType = NativeParam->getType(); + QualifierCollector QC; + const Type *NonQualTy = QC.strip(NativeParamType); + QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); + unsigned NativePointeeAddrSpace = + CGF.getContext().getTargetAddressSpace(NativePointeeTy); + QualType TargetTy = TargetParam->getType(); + llvm::Value *TargetAddr = CGF.EmitLoadOfScalar( + LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation()); + // First cast to generic. + TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( + /*AddrSpace=*/0)); + // Cast from generic to native address space. + TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( + NativePointeeAddrSpace)); + Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); + CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, + NativeParamType); + return NativeParamAddr; +} + +void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall( + CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, + ArrayRef<llvm::Value *> Args) const { + SmallVector<llvm::Value *, 4> TargetArgs; + TargetArgs.reserve(Args.size()); + auto *FnType = + cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType()); + for (unsigned I = 0, E = Args.size(); I < E; ++I) { + if (FnType->isVarArg() && FnType->getNumParams() <= I) { + TargetArgs.append(std::next(Args.begin(), I), Args.end()); + break; + } + llvm::Type *TargetType = FnType->getParamType(I); + llvm::Value *NativeArg = Args[I]; + if (!TargetType->isPointerTy()) { + TargetArgs.emplace_back(NativeArg); + continue; + } + llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo( + /*AddrSpace=*/0)); + TargetArgs.emplace_back( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); + } + CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); +} + +/// Emit function which wraps the outline parallel region +/// and controls the arguments which are passed to this function. +/// The wrapper ensures that the outlined function is called +/// with the correct arguments when data is shared. +llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { + ASTContext &Ctx = CGM.getContext(); + const auto &CS = *cast<CapturedStmt>(D.getAssociatedStmt()); + + // Create a function that takes as argument the source thread. + FunctionArgList WrapperArgs; + QualType Int16QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); + QualType Int32QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); + QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy); + QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy); + ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other); + ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other); + ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy, + ImplicitParamDecl::Other); + WrapperArgs.emplace_back(&ParallelLevelArg); + WrapperArgs.emplace_back(&WrapperArg); + WrapperArgs.emplace_back(&SharedArgsList); + + auto &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); + + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + + CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs); + + const auto *RD = CS.getCapturedRecordDecl(); + auto CurField = RD->field_begin(); + + // Get the array of arguments. + SmallVector<llvm::Value *, 8> Args; + + // TODO: suppport SIMD and pass actual values + Args.emplace_back(llvm::ConstantPointerNull::get( + CGM.Int32Ty->getPointerTo())); + Args.emplace_back(llvm::ConstantPointerNull::get( + CGM.Int32Ty->getPointerTo())); + + CGBuilderTy &Bld = CGF.Builder; + auto CI = CS.capture_begin(); + + // Load the start of the array + auto SharedArgs = + CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList), + VoidPtrPtrQTy->castAs<PointerType>()); + + // For each captured variable + for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) { + // Name of captured variable + StringRef Name; + if (CI->capturesThis()) + Name = "this"; + else + Name = CI->getCapturedVar()->getName(); + + // We retrieve the CLANG type of the argument. We use it to create + // an alloca which will give us the LLVM type. + QualType ElemTy = CurField->getType(); + // If this is a capture by copy the element type has to be the pointer to + // the data. + if (CI->capturesVariableByCopy()) + ElemTy = Ctx.getPointerType(ElemTy); + + // Get shared address of the captured variable. + Address ArgAddress = Bld.CreateConstInBoundsGEP( + SharedArgs, I, CGF.getPointerSize()); + Address TypedArgAddress = Bld.CreateBitCast( + ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy))); + llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress, + /*Volatile=*/false, Int32PtrQTy, SourceLocation()); + Args.emplace_back(Arg); + } + + emitCall(CGF, OutlinedParallelFn, Args); + CGF.FinishFunction(); + return Fn; +} |