summaryrefslogtreecommitdiff
path: root/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp128
1 files changed, 94 insertions, 34 deletions
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 48dcbbf3cabd7..708260429f68e 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -107,6 +107,10 @@ enum OpenMPRTLFunctionNVPTX {
/// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
/// global_tid);
OMPRTL__kmpc_barrier_simple_spmd,
+ /// Call to int32_t __kmpc_warp_active_thread_mask(void);
+ OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
+ /// Call to void __kmpc_syncwarp(int32_t Mask);
+ OMPRTL_NVPTX__kmpc_syncwarp,
};
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -276,7 +280,8 @@ static RecordDecl *buildRecordForGlobalizedVars(
}
} else {
llvm::APInt ArraySize(32, BufSize);
- Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
+ Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
+ 0);
Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
@@ -287,10 +292,11 @@ static RecordDecl *buildRecordForGlobalizedVars(
static_cast<CharUnits::QuantityType>(
GlobalMemoryAlignment)));
Field->addAttr(AlignedAttr::CreateImplicit(
- C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
+ C, /*IsAlignmentExpr=*/true,
IntegerLiteral::Create(C, Align,
C.getIntTypeForBitwidth(32, /*Signed=*/0),
- SourceLocation())));
+ SourceLocation()),
+ {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
}
GlobalizedRD->addDecl(Field);
MappedDeclsFields.try_emplace(VD, Field);
@@ -790,12 +796,16 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
+ case OMPD_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
+ case OMPD_master_taskloop:
+ case OMPD_master_taskloop_simd:
+ case OMPD_parallel_master_taskloop:
case OMPD_requires:
case OMPD_unknown:
llvm_unreachable("Unexpected directive.");
@@ -860,12 +870,16 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
+ case OMPD_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
+ case OMPD_master_taskloop:
+ case OMPD_master_taskloop_simd:
+ case OMPD_parallel_master_taskloop:
case OMPD_requires:
case OMPD_unknown:
break;
@@ -1023,12 +1037,16 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx,
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
+ case OMPD_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
+ case OMPD_master_taskloop:
+ case OMPD_master_taskloop_simd:
+ case OMPD_parallel_master_taskloop:
case OMPD_requires:
case OMPD_unknown:
llvm_unreachable("Unexpected directive.");
@@ -1099,12 +1117,16 @@ static bool supportsLightweightRuntime(ASTContext &Ctx,
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
+ case OMPD_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
+ case OMPD_master_taskloop:
+ case OMPD_master_taskloop_simd:
+ case OMPD_parallel_master_taskloop:
case OMPD_requires:
case OMPD_unknown:
break;
@@ -1794,6 +1816,20 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
->addFnAttr(llvm::Attribute::Convergent);
break;
}
+ case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: {
+ // Build int32_t __kmpc_warp_active_thread_mask(void);
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask");
+ break;
+ }
+ case OMPRTL_NVPTX__kmpc_syncwarp: {
+ // Build void __kmpc_syncwarp(kmp_int32 Mask);
+ auto *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_syncwarp");
+ break;
+ }
}
return RTLFn;
}
@@ -1871,6 +1907,19 @@ unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
llvm_unreachable("Unknown flags are requested.");
}
+bool CGOpenMPRuntimeNVPTX::tryEmitDeclareVariant(const GlobalDecl &NewGD,
+ const GlobalDecl &OldGD,
+ llvm::GlobalValue *OrigAddr,
+ bool IsForDefinition) {
+ // Emit the function in OldGD with the body from NewGD, if NewGD is defined.
+ auto *NewFD = cast<FunctionDecl>(NewGD.getDecl());
+ if (NewFD->isDefined()) {
+ CGM.emitOpenMPDeviceFunctionRedefinition(OldGD, NewGD, OrigAddr);
+ return true;
+ }
+ return false;
+}
+
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
: CGOpenMPRuntime(CGM, "_", "$") {
if (!CGM.getLangOpts().OpenMPIsDevice)
@@ -2030,7 +2079,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().GlobalRecord = GlobalizedRD;
I->getSecond().MappedParams =
- llvm::make_unique<CodeGenFunction::OMPMapVars>();
+ std::make_unique<CodeGenFunction::OMPMapVars>();
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const auto &Pair : MappedDeclsFields) {
assert(Pair.getFirst()->isCanonicalDecl() &&
@@ -2414,9 +2463,8 @@ void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
if (!CGF.HaveInsertPoint())
return;
- Address ZeroAddr = CGF.CreateMemTemp(
- CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
- /*Name*/ ".zero.addr");
+ Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
@@ -2445,16 +2493,19 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
// Force inline this outlined function at its call site.
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
- Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
- /*DestWidth=*/32, /*Signed=*/1),
- ".zero.addr");
+ Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
// ThreadId for serialized parallels is 0.
Address ThreadIDAddr = ZeroAddr;
- auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
+ auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr](
CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
+ Address ZeroAddr =
+ CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".bound.zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
@@ -2611,17 +2662,19 @@ void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
//
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
- Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
- /*DestWidth=*/32, /*Signed=*/1),
- ".zero.addr");
+ Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
// ThreadId for serialized parallels is 0.
Address ThreadIDAddr = ZeroAddr;
- auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
- &ThreadIDAddr](CodeGenFunction &CGF,
- PrePostActionTy &Action) {
+ auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
+ CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
+ Address ZeroAddr =
+ CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".bound.zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
@@ -2669,8 +2722,9 @@ void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
llvm::ConstantPointerNull::get(
cast<llvm::PointerType>(getIdentTyPointerTy())),
llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
- CGF.EmitRuntimeCall(
+ llvm::CallInst *Call = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
+ Call->setConvergent();
}
void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
@@ -2684,7 +2738,9 @@ void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
unsigned Flags = getDefaultFlagsForBarriers(Kind);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
getThreadID(CGF, Loc)};
- CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
+ llvm::CallInst *Call = CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
+ Call->setConvergent();
}
void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
@@ -2697,6 +2753,9 @@ void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
+ // Get the mask of active threads in the warp.
+ llvm::Value *Mask = CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
// Fetch team-local id of the thread.
llvm::Value *ThreadID = getNVPTXThreadID(CGF);
@@ -2737,8 +2796,9 @@ void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
// Block waits for all threads in current team to finish then increments the
// counter variable and returns to the loop.
CGF.EmitBlock(SyncBB);
- emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
+ // Reconverge active threads in the warp.
+ (void)CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);
llvm::Value *IncCounterVal =
CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
@@ -4239,7 +4299,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
}
llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
QualType ReductionArrayTy =
- C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
+ C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
@@ -4515,9 +4575,8 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
const auto *RD = CS.getCapturedRecordDecl();
auto CurField = RD->field_begin();
- Address ZeroAddr = CGF.CreateMemTemp(
- CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
- /*Name*/ ".zero.addr");
+ Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
// Get the array of arguments.
SmallVector<llvm::Value *, 8> Args;
@@ -4634,7 +4693,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
return;
auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().MappedParams =
- llvm::make_unique<CodeGenFunction::OMPMapVars>();
+ std::make_unique<CodeGenFunction::OMPMapVars>();
I->getSecond().GlobalRecord = GlobalizedVarsRecord;
I->getSecond().EscapedParameters.insert(
VarChecker.getEscapedParameters().begin(),
@@ -4700,7 +4759,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
CharUnits Align = CGM.getContext().getDeclAlign(VD);
- GV->setAlignment(Align.getQuantity());
+ GV->setAlignment(Align.getAsAlign());
return Address(GV, Align);
}
case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
@@ -4712,7 +4771,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
CharUnits Align = CGM.getContext().getDeclAlign(VD);
- GV->setAlignment(Align.getQuantity());
+ GV->setAlignment(Align.getAsAlign());
return Address(GV, Align);
}
case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
@@ -4723,7 +4782,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
llvm::GlobalValue::InternalLinkage,
llvm::Constant::getNullValue(VarTy), VD->getName());
CharUnits Align = CGM.getContext().getDeclAlign(VD);
- GV->setAlignment(Align.getQuantity());
+ GV->setAlignment(Align.getAsAlign());
return Address(GV, Align);
}
}
@@ -5026,7 +5085,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
Size = llvm::alignTo(Size, RecAlignment);
llvm::APInt ArySize(/*numBits=*/64, Size);
QualType SubTy = C.getConstantArrayType(
- C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
+ C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
const bool UseSharedMemory = Size <= SharedMemorySize;
auto *Field =
FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
@@ -5053,7 +5112,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
if (!SharedStaticRD->field_empty()) {
llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
QualType SubTy = C.getConstantArrayType(
- C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
+ C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
auto *Field = FieldDecl::Create(
C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
@@ -5086,11 +5145,12 @@ void CGOpenMPRuntimeNVPTX::clear() {
std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
llvm::APInt Size1(32, SMsBlockPerSM.second);
QualType Arr1Ty =
- C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
+ C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
/*IndexTypeQuals=*/0);
llvm::APInt Size2(32, SMsBlockPerSM.first);
- QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
+ QualType Arr2Ty =
+ C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
+ /*IndexTypeQuals=*/0);
llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
// FIXME: nvlink does not handle weak linkage correctly (object with the
// different size are reported as erroneous).