aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp')
-rw-r--r--contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp1575
1 files changed, 184 insertions, 1391 deletions
diff --git a/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3d..8965a14d88a6 100644
--- a/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/contrib/llvm-project/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -92,9 +92,9 @@ static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
- } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
+ } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
- while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+ while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
Base = TempOASE->getBase()->IgnoreParenImpCasts();
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
@@ -501,31 +501,6 @@ public:
};
} // anonymous namespace
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
-}
-
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
@@ -646,7 +621,6 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
case OMPD_target:
case OMPD_target_teams:
return hasNestedSPMDDirective(Ctx, D);
- case OMPD_target_teams_loop:
case OMPD_target_parallel_loop:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
@@ -658,6 +632,12 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
return true;
case OMPD_target_teams_distribute:
return false;
+ case OMPD_target_teams_loop:
+ // Whether this is true or not depends on how the directive will
+ // eventually be emitted.
+ if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
+ return TTLD->canBeParallelFor();
+ return false;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
@@ -1096,14 +1076,15 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
- LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
- Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
+ LValue VarAddr =
+ CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
+ Rec.second.PrivateAddr = VarAddr.getAddress();
Rec.second.GlobalizedVal = VoidPtr;
// Assign the local allocation to the newly globalized location.
if (EscapedParam) {
CGF.EmitStoreOfScalar(ParValue, VarAddr);
- I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
+ I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
}
if (auto *DI = CGF.getDebugInfo())
VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
@@ -1117,7 +1098,7 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
- I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
+ I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
}
I->getSecond().MappedParams->apply(CGF);
}
@@ -1206,8 +1187,8 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
- Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
- /*Name=*/".zero.addr");
+ RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
+ /*Name=*/".zero.addr");
CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
// We don't emit any thread id function call in bare kernel, but because the
@@ -1215,7 +1196,7 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
if (IsBareKernel)
OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
else
- OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
+ OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
@@ -1289,7 +1270,7 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
llvm::ConstantInt::get(CGF.Int32Ty, -1),
FnPtr,
ID,
- Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
+ Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
CGF.VoidPtrPtrTy),
llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
@@ -1429,1132 +1410,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
TBAAAccessInfo());
}
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
-
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
-
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
-
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
- IntTy);
- ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), IntTy);
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
- ElemPtr =
- Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEnd.getPointer(),
- Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
- CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
-}
-
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
-
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- QualType PrivatePtrType = C.getPointerType(Private->getType());
- llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
- break;
- }
- }
-
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = SrcElementAddr.withElementType(
- CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr =
- DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
-
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.getPointer(), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false,
- C.VoidPtrTy);
- }
-
- ++Idx;
- }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-/// shared smem[warp_size];
-/// For all data entries D in reduce_data:
-/// sync
-/// If (I am the first lane in each warp)
-/// Copy my local D to smem[warp_id]
-/// sync
-/// if (I am the first warp)
-/// Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
-
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
-
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr(ElemPtrPtr, CopyType, Align);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
- // Casting to actual data type.
- // MediumPtr = (CopyType*)MediumPtrAddr;
- Address MediumPtr(MediumPtrVal, CopyType, Align);
-
- // elem = *elemptr
- //*MediumPtr = elem
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, CType, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
-
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(MergeBB);
-
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
-
- //
- // Warp 0 copies reduce element from transfer medium.
- //
- llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
-
- // Up to 32 threads in warp 0 are active.
- llvm::Value *IsActiveThread =
- Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
- Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
- CGF.EmitBlock(W0ThenBB);
-
- // SrcMediumPtr = &medium[tid]
- llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferMedium,
- {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
- // SrcMediumVal = *SrcMediumPtr;
- Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
-
- // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
- Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
- TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
- Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
- if (NumIters > 1)
- TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
-
- // *TargetElemPtr = SrcMediumVal;
- llvm::Value *SrcMediumValue =
- CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
- CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- CType);
- Bld.CreateBr(W0MergeBB);
-
- CGF.EmitBlock(W0ElseBB);
- Bld.CreateBr(W0MergeBB);
-
- CGF.EmitBlock(W0MergeBB);
-
- if (NumIters > 1) {
- Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
- CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
- CGF.EmitBranch(PrecondBB);
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB);
- }
- RealTySize %= TySize;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// Emit a helper that reduces data across two OpenMP threads (lanes)
-/// in the same warp. It uses shuffle instructions to copy over data from
-/// a remote lane's stack. The reduction algorithm performed is specified
-/// by the fourth parameter.
-///
-/// Algorithm Versions.
-/// Full Warp Reduce (argument value 0):
-/// This algorithm assumes that all 32 lanes are active and gathers
-/// data from these 32 lanes, producing a single resultant value.
-/// Contiguous Partial Warp Reduce (argument value 1):
-/// This algorithm assumes that only a *contiguous* subset of lanes
-/// are active. This happens for the last warp in a parallel region
-/// when the user specified num_threads is not an integer multiple of
-/// 32. This contiguous subset always starts with the zeroth lane.
-/// Partial Warp Reduce (argument value 2):
-/// This algorithm gathers data from any number of lanes at any position.
-/// All reduced values are stored in the lowest possible lane. The set
-/// of problems every algorithm addresses is a super set of those
-/// addressable by algorithms with a lower version number. Overhead
-/// increases as algorithm version increases.
-///
-/// Terminology
-/// Reduce element:
-/// Reduce element refers to the individual data field with primitive
-/// data types to be combined and reduced across threads.
-/// Reduce list:
-/// Reduce list refers to a collection of local, thread-private
-/// reduce elements.
-/// Remote Reduce list:
-/// Remote Reduce list refers to a collection of remote (relative to
-/// the current thread) reduce elements.
-///
-/// We distinguish between three states of threads that are important to
-/// the implementation of this function.
-/// Alive threads:
-/// Threads in a warp executing the SIMT instruction, as distinguished from
-/// threads that are inactive due to divergent control flow.
-/// Active threads:
-/// The minimal set of threads that has to be alive upon entry to this
-/// function. The computation is correct iff active threads are alive.
-/// Some threads are alive but they are not active because they do not
-/// contribute to the computation in any useful manner. Turning them off
-/// may introduce control flow overheads without any tangible benefits.
-/// Effective threads:
-/// In order to comply with the argument requirements of the shuffle
-/// function, we must keep all lanes holding data alive. But at most
-/// half of them perform value aggregation; we refer to this half of
-/// threads as effective. The other half is simply handing off their
-/// data.
-///
-/// Procedure
-/// Value shuffle:
-/// In this step active threads transfer data from higher lane positions
-/// in the warp to lower lane positions, creating Remote Reduce list.
-/// Value aggregation:
-/// In this step, effective threads combine their thread local Reduce list
-/// with Remote Reduce list and store the result in the thread local
-/// Reduce list.
-/// Value copy:
-/// In this step, we deal with the assumption made by algorithm 2
-/// (i.e. contiguity assumption). When we have an odd number of lanes
-/// active, say 2k+1, only k threads will be effective and therefore k
-/// new values will be produced. However, the Reduce list owned by the
-/// (2k+1)th thread is ignored in the value aggregation. Therefore
-/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
-/// that the contiguity assumption still holds.
-static llvm::Function *emitShuffleAndReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
-
- // Thread local Reduce list used to host the values of data to be reduced.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Current lane id; could be logical.
- ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
- ImplicitParamKind::Other);
- // Offset of the remote source lane relative to the current lane.
- ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamKind::Other);
- // Algorithm version. This is expected to be known at compile time.
- ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.ShortTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&LaneIDArg);
- Args.push_back(&RemoteLaneOffsetArg);
- Args.push_back(&AlgoVerArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
-
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, SourceLocation()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
- llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
- AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
- llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
- AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
- llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
- AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
- // Create a local thread-private variable to host the Reduce list
- // from a remote lane.
- Address RemoteReduceList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
-
- // This loop iterates through the list of reduce elements and copies,
- // element by element, from a remote lane in the warp to RemoteReduceList,
- // hosted on the thread's stack.
- emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
- LocalReduceList, RemoteReduceList,
- {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
- /*ScratchpadIndex=*/nullptr,
- /*ScratchpadWidth=*/nullptr});
-
- // The actions to be performed on the Remote Reduce list is dependent
- // on the algorithm version.
- //
- // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
- // LaneId % 2 == 0 && Offset > 0):
- // do the reduction value aggregation
- //
- // The thread local variable Reduce list is mutated in place to host the
- // reduced data, which is the aggregated value produced from local and
- // remote lanes.
- //
- // Note that AlgoVer is expected to be a constant integer known at compile
- // time.
- // When AlgoVer==0, the first conjunction evaluates to true, making
- // the entire predicate true during compile time.
- // When AlgoVer==1, the second conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- // When AlgoVer==2, the third conjunction has only the second part to be
- // evaluated during runtime. Other conjunctions evaluates to false
- // during compile time.
- llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
-
- llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondAlgo1 = Bld.CreateAnd(
- Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
- llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
- llvm::Value *CondAlgo2 = Bld.CreateAnd(
- Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
- CondAlgo2 = Bld.CreateAnd(
- CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
-
- llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
- CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
-
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
-
- CGF.EmitBlock(ThenBB);
- // reduce_function(LocalReduceList, RemoteReduceList)
- llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- LocalReduceList.getPointer(), CGF.VoidPtrTy);
- llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- RemoteReduceList.getPointer(), CGF.VoidPtrTy);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(MergeBB);
-
- // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
- // Reduce list.
- Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
- llvm::Value *CondCopy = Bld.CreateAnd(
- Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
- llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
- Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
-
- CGF.EmitBlock(CpyThenBB);
- emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
- RemoteReduceList, LocalReduceList);
- Bld.CreateBr(CpyMergeBB);
-
- CGF.EmitBlock(CpyElseBB);
- Bld.CreateBr(CpyMergeBB);
-
- CGF.EmitBlock(CpyMergeBB);
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy local D to buffer.D[Idx]
-static llvm::Value *emitListToGlobalCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemTy = CGF.ConvertTypeForMem(Private->getType());
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, ElemTy->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- GlobLVal.setAddress(Address(GlobAddr.getPointer(),
- CGF.ConvertTypeForMem(Private->getType()),
- GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(
- ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- CGF.EmitStoreOfScalar(V, GlobLVal);
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
- CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(GlobLVal,
- CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(GlobPtrs, reduce_data);
-static llvm::Value *emitListToGlobalReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
- C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- // Call reduce_function(GlobalReduceList, ReduceList)
- llvm::Value *GlobalReduceList = ReductionList.getPointer();
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-/// For all data entries D in reduce_data:
-/// Copy buffer.D[Idx] to local D;
-static llvm::Value *emitGlobalToListCopyFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, Loc),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- ElemTy = CGF.ConvertTypeForMem(Private->getType());
- ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtrPtr, ElemTy->getPointerTo());
- Address ElemPtr =
- Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
- const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
- // Global = Buffer.VD[Idx];
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- GlobLVal.setAddress(Address(GlobAddr.getPointer(),
- CGF.ConvertTypeForMem(Private->getType()),
- GlobAddr.getAlignment()));
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
- CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
- CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- GlobLVal, Private->getType(),
- AggValueSlot::DoesNotOverlap);
- break;
- }
- ++Idx;
- }
-
- CGF.FinishFunction();
- return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
-/// void *GlobPtrs[];
-/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
-/// ...
-/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
-/// reduce_function(reduce_data, GlobPtrs);
-static llvm::Value *emitGlobalToListReduceFunction(
- CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy, SourceLocation Loc,
- const RecordDecl *TeamReductionRec,
- const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
- &VarFieldMap,
- llvm::Function *ReduceFn) {
- ASTContext &C = CGM.getContext();
-
- // Buffer: global reduction buffer.
- ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // Idx: index of the buffer.
- ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
- ImplicitParamKind::Other);
- // ReduceList: thread local Reduce list.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&BufferArg);
- Args.push_back(&IdxArg);
- Args.push_back(&ReduceListArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
- QualType StaticTy = C.getRecordType(TeamReductionRec);
- llvm::Type *LLVMReductionsBufferTy =
- CGM.getTypes().ConvertTypeForMem(StaticTy);
- llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
- LLVMReductionsBufferTy->getPointerTo());
-
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
- /*Volatile=*/false, C.IntTy,
- Loc)};
- unsigned Idx = 0;
- for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- // Global = Buffer.VD[Idx];
- const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
- const FieldDecl *FD = VarFieldMap.lookup(VD);
- llvm::Value *BufferPtr =
- Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
- LValue GlobLVal = CGF.EmitLValueForField(
- CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
- Address GlobAddr = GlobLVal.getAddress(CGF);
- CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
- C.VoidPtrTy);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- // Call reduce_function(ReduceList, GlobalReduceList)
- llvm::Value *GlobalReduceList = ReductionList.getPointer();
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
- CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
- CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
- CGF.FinishFunction();
- return Fn;
-}
-
///
/// Design of OpenMP reductions on the GPU
///
@@ -2805,21 +1660,20 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-#ifndef NDEBUG
+ bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-#endif
+
+ ASTContext &C = CGM.getContext();
if (Options.SimpleReduction) {
assert(!TeamsReduction && !ParallelReduction &&
"Invalid reduction selection in emitReduction.");
+ (void)ParallelReduction;
CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
ReductionOps, Options);
return;
}
- assert((TeamsReduction || ParallelReduction) &&
- "Invalid reduction selection in emitReduction.");
-
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
int Cnt = 0;
@@ -2827,145 +1681,85 @@ void CGOpenMPRuntimeGPU::emitReduction(
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
++Cnt;
}
-
- ASTContext &C = CGM.getContext();
const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
- // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
- // RedList, shuffle_reduce_func, interwarp_copy_func);
- // or
- // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
- llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-
- llvm::Value *Res;
- // 1. Build a list of reduction variables.
- // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
- auto Size = RHSExprs.size();
- for (const Expr *E : Privates) {
- if (E->getType()->isVariablyModifiedType())
- // Reserve place for array size.
- ++Size;
- }
- llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
- QualType ReductionArrayTy = C.getConstantArrayType(
- C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
- /*IndexTypeQuals=*/0);
- Address ReductionList =
- CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
- auto IPriv = Privates.begin();
- unsigned Idx = 0;
- for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
- Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- CGF.Builder.CreateStore(
- CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
- Elem);
- if ((*IPriv)->getType()->isVariablyModifiedType()) {
- // Store array size.
- ++Idx;
- Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
- llvm::Value *Size = CGF.Builder.CreateIntCast(
- CGF.getVLASize(
- CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
- .NumElts,
- CGF.SizeTy, /*isSigned=*/false);
- CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
- Elem);
- }
- }
-
- llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
- ReductionList.getPointer(), CGF.VoidPtrTy);
- llvm::Function *ReductionFn = emitReductionFunction(
- CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
- Privates, LHSExprs, RHSExprs, ReductionOps);
- llvm::Value *ReductionDataSize =
- CGF.getTypeSize(C.getRecordType(ReductionRec));
- ReductionDataSize =
- CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
- llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
- CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
- llvm::Value *InterWarpCopyFn =
- emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
- if (ParallelReduction) {
- llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
- InterWarpCopyFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
- Args);
- } else {
- assert(TeamsReduction && "expected teams reduction.");
+ if (TeamsReduction)
TeamsReductions.push_back(ReductionRec);
- auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
- {}, "_openmp_teams_reductions_buffer_$_$ptr");
- llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
- llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
- CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
- ReductionFn);
- llvm::Value *Args[] = {
- RTLoc,
- KernelTeamsReductionPtr,
- CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
- ReductionDataSize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn,
- GlobalToBufferCpyFn,
- GlobalToBufferRedFn,
- BufferToGlobalCpyFn,
- BufferToGlobalRedFn};
-
- Res = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
- Args);
- }
+ // Source location for the ident struct
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- // 5. Build if (res == 1)
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
- llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
- Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
- CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
- // 6. Build then branch: where we have reduced values in the master
- // thread in each team.
- // __kmpc_end_reduce{_nowait}(<gtid>);
- // break;
- CGF.EmitBlock(ThenBB);
-
- // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
- auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
- this](CodeGenFunction &CGF, PrePostActionTy &Action) {
- auto IPriv = Privates.begin();
- auto ILHS = LHSExprs.begin();
- auto IRHS = RHSExprs.begin();
- for (const Expr *E : ReductionOps) {
- emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
- cast<DeclRefExpr>(*IRHS));
- ++IPriv;
- ++ILHS;
- ++IRHS;
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+ CGF.AllocaInsertPt->getIterator());
+ InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
+ CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
+ llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
+
+ CodeGenFunction::OMPPrivateScope Scope(CGF);
+ unsigned Idx = 0;
+ for (const Expr *Private : Privates) {
+ llvm::Type *ElementType;
+ llvm::Value *Variable;
+ llvm::Value *PrivateVariable;
+ llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
+ ElementType = CGF.ConvertTypeForMem(Private->getType());
+ const auto *RHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+ PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
+ const auto *LHSVar =
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+ Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
+ llvm::OpenMPIRBuilder::EvalKind EvalKind;
+ switch (CGF.getEvaluationKind(Private->getType())) {
+ case TEK_Scalar:
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
+ break;
+ case TEK_Complex:
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
+ break;
+ case TEK_Aggregate:
+ EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
+ break;
}
- };
- RegionCodeGenTy RCG(CodeGen);
- RCG(CGF);
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+ auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+ llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+ llvm::Function *NewFunc) {
+ CGF.Builder.restoreIP(CodeGenIP);
+ auto *CurFn = CGF.CurFn;
+ CGF.CurFn = NewFunc;
+
+ *LHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+ .emitRawPointer(CGF);
+ *RHSPtr = CGF.GetAddrOfLocalVar(
+ cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+ .emitRawPointer(CGF);
+
+ emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+ cast<DeclRefExpr>(LHSExprs[I]),
+ cast<DeclRefExpr>(RHSExprs[I]));
+
+ CGF.CurFn = CurFn;
+
+ return InsertPointTy(CGF.Builder.GetInsertBlock(),
+ CGF.Builder.GetInsertPoint());
+ };
+ ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+ ElementType, Variable, PrivateVariable, EvalKind,
+ /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+ Idx++;
+ }
+
+ CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+ OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+ DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+ CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
+ RTLoc));
+ return;
}
const VarDecl *
@@ -3106,15 +1900,15 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
// Get the array of arguments.
SmallVector<llvm::Value *, 8> Args;
- Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
- Args.emplace_back(ZeroAddr.getPointer());
+ Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
+ Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
CGBuilderTy &Bld = CGF.Builder;
auto CI = CS.capture_begin();
// Use global memory for data sharing.
// Handle passing of global args to workers.
- Address GlobalArgs =
+ RawAddress GlobalArgs =
CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
@@ -3399,8 +2193,8 @@ void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
if (VD->getType().getCanonicalType()->isReferenceType())
VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
VD->getType().getCanonicalType())
- .getAddress(CGF);
- CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
+ .getAddress();
+ CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
}
}
}
@@ -3434,106 +2228,112 @@ bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
return false;
}
-// Get current CudaArch and ignore any unknown values
-static CudaArch getCudaArch(CodeGenModule &CGM) {
+// Get current OffloadArch and ignore any unknown values
+static OffloadArch getOffloadArch(CodeGenModule &CGM) {
if (!CGM.getTarget().hasFeature("ptx"))
- return CudaArch::UNKNOWN;
+ return OffloadArch::UNKNOWN;
for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
if (Feature.getValue()) {
- CudaArch Arch = StringToCudaArch(Feature.getKey());
- if (Arch != CudaArch::UNKNOWN)
+ OffloadArch Arch = StringToOffloadArch(Feature.getKey());
+ if (Arch != OffloadArch::UNKNOWN)
return Arch;
}
}
- return CudaArch::UNKNOWN;
+ return OffloadArch::UNKNOWN;
}
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
-void CGOpenMPRuntimeGPU::processRequiresDirective(
- const OMPRequiresDecl *D) {
+void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
for (const OMPClause *Clause : D->clauselists()) {
if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
- CudaArch Arch = getCudaArch(CGM);
+ OffloadArch Arch = getOffloadArch(CGM);
switch (Arch) {
- case CudaArch::SM_20:
- case CudaArch::SM_21:
- case CudaArch::SM_30:
- case CudaArch::SM_32:
- case CudaArch::SM_35:
- case CudaArch::SM_37:
- case CudaArch::SM_50:
- case CudaArch::SM_52:
- case CudaArch::SM_53: {
+ case OffloadArch::SM_20:
+ case OffloadArch::SM_21:
+ case OffloadArch::SM_30:
+ case OffloadArch::SM_32_:
+ case OffloadArch::SM_35:
+ case OffloadArch::SM_37:
+ case OffloadArch::SM_50:
+ case OffloadArch::SM_52:
+ case OffloadArch::SM_53: {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
- Out << "Target architecture " << CudaArchToString(Arch)
+ Out << "Target architecture " << OffloadArchToString(Arch)
<< " does not support unified addressing";
CGM.Error(Clause->getBeginLoc(), Out.str());
return;
}
- case CudaArch::SM_60:
- case CudaArch::SM_61:
- case CudaArch::SM_62:
- case CudaArch::SM_70:
- case CudaArch::SM_72:
- case CudaArch::SM_75:
- case CudaArch::SM_80:
- case CudaArch::SM_86:
- case CudaArch::SM_87:
- case CudaArch::SM_89:
- case CudaArch::SM_90:
- case CudaArch::SM_90a:
- case CudaArch::GFX600:
- case CudaArch::GFX601:
- case CudaArch::GFX602:
- case CudaArch::GFX700:
- case CudaArch::GFX701:
- case CudaArch::GFX702:
- case CudaArch::GFX703:
- case CudaArch::GFX704:
- case CudaArch::GFX705:
- case CudaArch::GFX801:
- case CudaArch::GFX802:
- case CudaArch::GFX803:
- case CudaArch::GFX805:
- case CudaArch::GFX810:
- case CudaArch::GFX900:
- case CudaArch::GFX902:
- case CudaArch::GFX904:
- case CudaArch::GFX906:
- case CudaArch::GFX908:
- case CudaArch::GFX909:
- case CudaArch::GFX90a:
- case CudaArch::GFX90c:
- case CudaArch::GFX940:
- case CudaArch::GFX941:
- case CudaArch::GFX942:
- case CudaArch::GFX1010:
- case CudaArch::GFX1011:
- case CudaArch::GFX1012:
- case CudaArch::GFX1013:
- case CudaArch::GFX1030:
- case CudaArch::GFX1031:
- case CudaArch::GFX1032:
- case CudaArch::GFX1033:
- case CudaArch::GFX1034:
- case CudaArch::GFX1035:
- case CudaArch::GFX1036:
- case CudaArch::GFX1100:
- case CudaArch::GFX1101:
- case CudaArch::GFX1102:
- case CudaArch::GFX1103:
- case CudaArch::GFX1150:
- case CudaArch::GFX1151:
- case CudaArch::GFX1200:
- case CudaArch::GFX1201:
- case CudaArch::Generic:
- case CudaArch::UNUSED:
- case CudaArch::UNKNOWN:
+ case OffloadArch::SM_60:
+ case OffloadArch::SM_61:
+ case OffloadArch::SM_62:
+ case OffloadArch::SM_70:
+ case OffloadArch::SM_72:
+ case OffloadArch::SM_75:
+ case OffloadArch::SM_80:
+ case OffloadArch::SM_86:
+ case OffloadArch::SM_87:
+ case OffloadArch::SM_89:
+ case OffloadArch::SM_90:
+ case OffloadArch::SM_90a:
+ case OffloadArch::GFX600:
+ case OffloadArch::GFX601:
+ case OffloadArch::GFX602:
+ case OffloadArch::GFX700:
+ case OffloadArch::GFX701:
+ case OffloadArch::GFX702:
+ case OffloadArch::GFX703:
+ case OffloadArch::GFX704:
+ case OffloadArch::GFX705:
+ case OffloadArch::GFX801:
+ case OffloadArch::GFX802:
+ case OffloadArch::GFX803:
+ case OffloadArch::GFX805:
+ case OffloadArch::GFX810:
+ case OffloadArch::GFX9_GENERIC:
+ case OffloadArch::GFX900:
+ case OffloadArch::GFX902:
+ case OffloadArch::GFX904:
+ case OffloadArch::GFX906:
+ case OffloadArch::GFX908:
+ case OffloadArch::GFX909:
+ case OffloadArch::GFX90a:
+ case OffloadArch::GFX90c:
+ case OffloadArch::GFX940:
+ case OffloadArch::GFX941:
+ case OffloadArch::GFX942:
+ case OffloadArch::GFX10_1_GENERIC:
+ case OffloadArch::GFX1010:
+ case OffloadArch::GFX1011:
+ case OffloadArch::GFX1012:
+ case OffloadArch::GFX1013:
+ case OffloadArch::GFX10_3_GENERIC:
+ case OffloadArch::GFX1030:
+ case OffloadArch::GFX1031:
+ case OffloadArch::GFX1032:
+ case OffloadArch::GFX1033:
+ case OffloadArch::GFX1034:
+ case OffloadArch::GFX1035:
+ case OffloadArch::GFX1036:
+ case OffloadArch::GFX11_GENERIC:
+ case OffloadArch::GFX1100:
+ case OffloadArch::GFX1101:
+ case OffloadArch::GFX1102:
+ case OffloadArch::GFX1103:
+ case OffloadArch::GFX1150:
+ case OffloadArch::GFX1151:
+ case OffloadArch::GFX1152:
+ case OffloadArch::GFX12_GENERIC:
+ case OffloadArch::GFX1200:
+ case OffloadArch::GFX1201:
+ case OffloadArch::AMDGCNSPIRV:
+ case OffloadArch::Generic:
+ case OffloadArch::UNUSED:
+ case OffloadArch::UNKNOWN:
break;
- case CudaArch::LAST:
- llvm_unreachable("Unexpected Cuda arch.");
+ case OffloadArch::LAST:
+ llvm_unreachable("Unexpected GPU arch.");
}
}
}
@@ -3560,10 +2360,3 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
Args);
}
-
-llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
- ArrayRef<llvm::Value *> Args{};
- return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_get_warp_size),
- Args);
-}