diff options
Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 128 |
1 files changed, 94 insertions, 34 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 48dcbbf3cabd7..708260429f68e 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -107,6 +107,10 @@ enum OpenMPRTLFunctionNVPTX { /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 /// global_tid); OMPRTL__kmpc_barrier_simple_spmd, + /// Call to int32_t __kmpc_warp_active_thread_mask(void); + OMPRTL_NVPTX__kmpc_warp_active_thread_mask, + /// Call to void __kmpc_syncwarp(int32_t Mask); + OMPRTL_NVPTX__kmpc_syncwarp, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -276,7 +280,8 @@ static RecordDecl *buildRecordForGlobalizedVars( } } else { llvm::APInt ArraySize(32, BufSize); - Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0); + Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal, + 0); Field = FieldDecl::Create( C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, C.getTrivialTypeSourceInfo(Type, SourceLocation()), @@ -287,10 +292,11 @@ static RecordDecl *buildRecordForGlobalizedVars( static_cast<CharUnits::QuantityType>( GlobalMemoryAlignment))); Field->addAttr(AlignedAttr::CreateImplicit( - C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true, + C, /*IsAlignmentExpr=*/true, IntegerLiteral::Create(C, Align, C.getIntTypeForBitwidth(32, /*Signed=*/0), - SourceLocation()))); + SourceLocation()), + {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned)); } GlobalizedRD->addDecl(Field); MappedDeclsFields.try_emplace(VD, Field); @@ -790,12 +796,16 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx, case OMPD_teams_distribute_parallel_for_simd: case OMPD_target_update: case OMPD_declare_simd: + case OMPD_declare_variant: case OMPD_declare_target: case OMPD_end_declare_target: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_taskloop: case OMPD_taskloop_simd: + case OMPD_master_taskloop: + case OMPD_master_taskloop_simd: + case OMPD_parallel_master_taskloop: case OMPD_requires: case OMPD_unknown: llvm_unreachable("Unexpected directive."); @@ -860,12 +870,16 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx, case OMPD_teams_distribute_parallel_for_simd: case OMPD_target_update: case OMPD_declare_simd: + case OMPD_declare_variant: case OMPD_declare_target: case OMPD_end_declare_target: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_taskloop: case OMPD_taskloop_simd: + case OMPD_master_taskloop: + case OMPD_master_taskloop_simd: + case OMPD_parallel_master_taskloop: case OMPD_requires: case OMPD_unknown: break; @@ -1023,12 +1037,16 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx, case OMPD_teams_distribute_parallel_for_simd: case OMPD_target_update: case OMPD_declare_simd: + case OMPD_declare_variant: case OMPD_declare_target: case OMPD_end_declare_target: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_taskloop: case OMPD_taskloop_simd: + case OMPD_master_taskloop: + case OMPD_master_taskloop_simd: + case OMPD_parallel_master_taskloop: case OMPD_requires: case OMPD_unknown: llvm_unreachable("Unexpected directive."); @@ -1099,12 +1117,16 @@ static bool supportsLightweightRuntime(ASTContext &Ctx, case OMPD_teams_distribute_parallel_for_simd: case OMPD_target_update: case OMPD_declare_simd: + case OMPD_declare_variant: case OMPD_declare_target: case OMPD_end_declare_target: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_taskloop: case OMPD_taskloop_simd: + case OMPD_master_taskloop: + case OMPD_master_taskloop_simd: + case OMPD_parallel_master_taskloop: case OMPD_requires: case OMPD_unknown: break; @@ -1794,6 +1816,20 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { ->addFnAttr(llvm::Attribute::Convergent); break; } + case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: { + // Build int32_t __kmpc_warp_active_thread_mask(void); + auto *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask"); + break; + } + case OMPRTL_NVPTX__kmpc_syncwarp: { + // Build void __kmpc_syncwarp(kmp_int32 Mask); + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_syncwarp"); + break; + } } return RTLFn; } @@ -1871,6 +1907,19 @@ unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const { llvm_unreachable("Unknown flags are requested."); } +bool CGOpenMPRuntimeNVPTX::tryEmitDeclareVariant(const GlobalDecl &NewGD, + const GlobalDecl &OldGD, + llvm::GlobalValue *OrigAddr, + bool IsForDefinition) { + // Emit the function in OldGD with the body from NewGD, if NewGD is defined. + auto *NewFD = cast<FunctionDecl>(NewGD.getDecl()); + if (NewFD->isDefined()) { + CGM.emitOpenMPDeviceFunctionRedefinition(OldGD, NewGD, OrigAddr); + return true; + } + return false; +} + CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) : CGOpenMPRuntime(CGM, "_", "$") { if (!CGM.getLangOpts().OpenMPIsDevice) @@ -2030,7 +2079,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().GlobalRecord = GlobalizedRD; I->getSecond().MappedParams = - llvm::make_unique<CodeGenFunction::OMPMapVars>(); + std::make_unique<CodeGenFunction::OMPMapVars>(); DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const auto &Pair : MappedDeclsFields) { assert(Pair.getFirst()->isCanonicalDecl() && @@ -2414,9 +2463,8 @@ void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, if (!CGF.HaveInsertPoint()) return; - Address ZeroAddr = CGF.CreateMemTemp( - CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), - /*Name*/ ".zero.addr"); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); @@ -2445,16 +2493,19 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall( // Force inline this outlined function at its call site. Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( - /*DestWidth=*/32, /*Signed=*/1), - ".zero.addr"); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); // ThreadId for serialized parallels is 0. Address ThreadIDAddr = ZeroAddr; - auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr]( + auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr]( CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); + Address ZeroAddr = + CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".bound.zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); @@ -2611,17 +2662,19 @@ void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall( // llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; - Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth( - /*DestWidth=*/32, /*Signed=*/1), - ".zero.addr"); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); // ThreadId for serialized parallels is 0. Address ThreadIDAddr = ZeroAddr; - auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr, - &ThreadIDAddr](CodeGenFunction &CGF, - PrePostActionTy &Action) { + auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr]( + CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); + Address ZeroAddr = + CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".bound.zero.addr"); + CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); @@ -2669,8 +2722,9 @@ void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { llvm::ConstantPointerNull::get( cast<llvm::PointerType>(getIdentTyPointerTy())), llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; - CGF.EmitRuntimeCall( + llvm::CallInst *Call = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args); + Call->setConvergent(); } void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, @@ -2684,7 +2738,9 @@ void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, unsigned Flags = getDefaultFlagsForBarriers(Kind); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), getThreadID(CGF, Loc)}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args); + llvm::CallInst *Call = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args); + Call->setConvergent(); } void CGOpenMPRuntimeNVPTX::emitCriticalRegion( @@ -2697,6 +2753,9 @@ void CGOpenMPRuntimeNVPTX::emitCriticalRegion( llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); + // Get the mask of active threads in the warp. + llvm::Value *Mask = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask)); // Fetch team-local id of the thread. llvm::Value *ThreadID = getNVPTXThreadID(CGF); @@ -2737,8 +2796,9 @@ void CGOpenMPRuntimeNVPTX::emitCriticalRegion( // Block waits for all threads in current team to finish then increments the // counter variable and returns to the loop. CGF.EmitBlock(SyncBB); - emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false, - /*ForceSimpleCall=*/true); + // Reconverge active threads in the warp. + (void)CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask); llvm::Value *IncCounterVal = CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); @@ -4239,7 +4299,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction( } llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, + C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); Address ReductionList = CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); @@ -4515,9 +4575,8 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( const auto *RD = CS.getCapturedRecordDecl(); auto CurField = RD->field_begin(); - Address ZeroAddr = CGF.CreateMemTemp( - CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), - /*Name*/ ".zero.addr"); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, + /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); // Get the array of arguments. SmallVector<llvm::Value *, 8> Args; @@ -4634,7 +4693,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, return; auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().MappedParams = - llvm::make_unique<CodeGenFunction::OMPMapVars>(); + std::make_unique<CodeGenFunction::OMPMapVars>(); I->getSecond().GlobalRecord = GlobalizedVarsRecord; I->getSecond().EscapedParameters.insert( VarChecker.getEscapedParameters().begin(), @@ -4700,7 +4759,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant)); CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getQuantity()); + GV->setAlignment(Align.getAsAlign()); return Address(GV, Align); } case OMPAllocateDeclAttr::OMPPTeamMemAlloc: { @@ -4712,7 +4771,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getQuantity()); + GV->setAlignment(Align.getAsAlign()); return Address(GV, Align); } case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: @@ -4723,7 +4782,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy), VD->getName()); CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getQuantity()); + GV->setAlignment(Align.getAsAlign()); return Address(GV, Align); } } @@ -5026,7 +5085,7 @@ void CGOpenMPRuntimeNVPTX::clear() { Size = llvm::alignTo(Size, RecAlignment); llvm::APInt ArySize(/*numBits=*/64, Size); QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); const bool UseSharedMemory = Size <= SharedMemorySize; auto *Field = FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD, @@ -5053,7 +5112,7 @@ void CGOpenMPRuntimeNVPTX::clear() { if (!SharedStaticRD->field_empty()) { llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); auto *Field = FieldDecl::Create( C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), @@ -5086,11 +5145,12 @@ void CGOpenMPRuntimeNVPTX::clear() { std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); llvm::APInt Size1(32, SMsBlockPerSM.second); QualType Arr1Ty = - C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal, + C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal, - /*IndexTypeQuals=*/0); + QualType Arr2Ty = + C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, + /*IndexTypeQuals=*/0); llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); // FIXME: nvlink does not handle weak linkage correctly (object with the // different size are reported as erroneous). |