diff options
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp')
| -rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 516 |
1 files changed, 187 insertions, 329 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 62aacb9e24d6..293ccaa3413c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -85,18 +85,6 @@ public: ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; } }; -/// GPU Configuration: This information can be derived from cuda registers, -/// however, providing compile time constants helps generate more efficient -/// code. For all practical purposes this is fine because the configuration -/// is the same for all known NVPTX architectures. -enum MachineConfiguration : unsigned { - /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target - /// specific Grid Values like GV_Warp_Size, GV_Slot_Size - - /// Global memory alignment for performance. - GlobalMemoryAlignment = 128, -}; - static const ValueDecl *getPrivateItem(const Expr *RefExpr) { RefExpr = RefExpr->IgnoreParens(); if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { @@ -119,31 +107,23 @@ static const ValueDecl *getPrivateItem(const Expr *RefExpr) { return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); } - static RecordDecl *buildRecordForGlobalizedVars( ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, ArrayRef<const ValueDecl *> EscapedDeclsForTeams, llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> - &MappedDeclsFields, int BufSize) { + &MappedDeclsFields, + int BufSize) { using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) return nullptr; SmallVector<VarsDataTy, 4> GlobalizedVars; for (const ValueDecl *D : EscapedDecls) - GlobalizedVars.emplace_back( - CharUnits::fromQuantity(std::max( - C.getDeclAlign(D).getQuantity(), - static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))), - D); + GlobalizedVars.emplace_back(C.getDeclAlign(D), D); for (const ValueDecl *D : EscapedDeclsForTeams) GlobalizedVars.emplace_back(C.getDeclAlign(D), D); - llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) { - return L.first > R.first; - }); // Build struct _globalized_locals_ty { - // /* globalized vars */[WarSize] align (max(decl_align, - // GlobalMemoryAlignment)) + // /* globalized vars */[WarSize] align (decl_align) // /* globalized vars */ for EscapedDeclsForTeams // }; RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); @@ -173,18 +153,18 @@ static RecordDecl *buildRecordForGlobalizedVars( Field->addAttr(*I); } } else { - llvm::APInt ArraySize(32, BufSize); - Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal, - 0); + if (BufSize > 1) { + llvm::APInt ArraySize(32, BufSize); + Type = C.getConstantArrayType(Type, ArraySize, nullptr, + ArraySizeModifier::Normal, 0); + } Field = FieldDecl::Create( C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, C.getTrivialTypeSourceInfo(Type, SourceLocation()), /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit); Field->setAccess(AS_public); - llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(), - static_cast<CharUnits::QuantityType>( - GlobalMemoryAlignment))); + llvm::APInt Align(32, Pair.first.getQuantity()); Field->addAttr(AlignedAttr::CreateImplicit( C, /*IsAlignmentExpr=*/true, IntegerLiteral::Create(C, Align, @@ -551,10 +531,9 @@ CGOpenMPRuntimeGPU::getExecutionMode() const { return CurrentExecutionMode; } -static CGOpenMPRuntimeGPU::DataSharingMode -getDataSharingMode(CodeGenModule &CGM) { - return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA - : CGOpenMPRuntimeGPU::Generic; +CGOpenMPRuntimeGPU::DataSharingMode +CGOpenMPRuntimeGPU::getDataSharingMode() const { + return CurrentDataSharingMode; } /// Check for inner (nested) SPMD construct, if any @@ -752,27 +731,30 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, EntryFunctionState EST; WrapperFunctionsMap.clear(); + [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); + assert(!IsBareKernel && "bare kernel should not be at generic mode"); + // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeGPU::EntryFunctionState &EST; + const OMPExecutableDirective &D; public: - NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST) - : EST(EST) {} + NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, + const OMPExecutableDirective &D) + : EST(EST), D(D) {} void Enter(CodeGenFunction &CGF) override { - auto &RT = - static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); - RT.emitKernelInit(CGF, EST, /* IsSPMD */ false); + auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); + RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { - auto &RT = - static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); + auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); RT.clearLocThreadIdInsertPt(CGF); RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); } - } Action(EST); + } Action(EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, @@ -780,10 +762,17 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, IsInTTDRegion = false; } -void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF, +void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, + CodeGenFunction &CGF, EntryFunctionState &EST, bool IsSPMD) { + int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1, + MaxTeamsVal = -1; + computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal, + MinTeamsVal, MaxTeamsVal); + CGBuilderTy &Bld = CGF.Builder; - Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD)); + Bld.restoreIP(OMPBuilder.createTargetInit( + Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal)); if (!IsSPMD) emitGenericVarsProlog(CGF, EST.Loc); } @@ -794,8 +783,34 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, if (!IsSPMD) emitGenericVarsEpilog(CGF); + // This is temporary until we remove the fixed sized buffer. + ASTContext &C = CGM.getContext(); + RecordDecl *StaticRD = C.buildImplicitRecord( + "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union); + StaticRD->startDefinition(); + for (const RecordDecl *TeamReductionRec : TeamsReductions) { + QualType RecTy = C.getRecordType(TeamReductionRec); + auto *Field = FieldDecl::Create( + C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, + C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + StaticRD->addDecl(Field); + } + StaticRD->completeDefinition(); + QualType StaticTy = C.getRecordType(StaticRD); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + const auto &DL = CGM.getModule().getDataLayout(); + uint64_t ReductionDataSize = + TeamsReductions.empty() + ? 0 + : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); CGBuilderTy &Bld = CGF.Builder; - OMPBuilder.createTargetDeinit(Bld, IsSPMD); + OMPBuilder.createTargetDeinit(Bld, ReductionDataSize, + C.getLangOpts().OpenMPCUDAReductionBufNum); + TeamsReductions.clear(); } void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, @@ -807,25 +822,40 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); EntryFunctionState EST; + bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); + // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeGPU &RT; CGOpenMPRuntimeGPU::EntryFunctionState &EST; + bool IsBareKernel; + DataSharingMode Mode; + const OMPExecutableDirective &D; public: NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, - CGOpenMPRuntimeGPU::EntryFunctionState &EST) - : RT(RT), EST(EST) {} + CGOpenMPRuntimeGPU::EntryFunctionState &EST, + bool IsBareKernel, const OMPExecutableDirective &D) + : RT(RT), EST(EST), IsBareKernel(IsBareKernel), + Mode(RT.CurrentDataSharingMode), D(D) {} void Enter(CodeGenFunction &CGF) override { - RT.emitKernelInit(CGF, EST, /* IsSPMD */ true); + if (IsBareKernel) { + RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; + return; + } + RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { + if (IsBareKernel) { + RT.CurrentDataSharingMode = Mode; + return; + } RT.clearLocThreadIdInsertPt(CGF); RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); } - } Action(*this, EST); + } Action(*this, EST, IsBareKernel, D); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, @@ -833,24 +863,6 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, IsInTTDRegion = false; } -// Create a unique global variable to indicate the execution mode of this target -// region. The execution mode is either 'generic', or 'spmd' depending on the -// target directive. This variable is picked up by the offload library to setup -// the device appropriately before kernel launch. If the execution mode is -// 'generic', the runtime reserves one warp for the master, otherwise, all -// warps participate in parallel work. -static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, - bool Mode) { - auto *GVMode = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD - : OMP_TGT_EXEC_MODE_GENERIC), - Twine(Name, "_exec_mode")); - GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility); - CGM.addCompilerUsedGlobal(GVMode); -} - void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, @@ -861,26 +873,30 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); - if (Mode) + bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); + if (Mode || IsBareKernel) emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); else emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); - - setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); } CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) : CGOpenMPRuntime(CGM) { - llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsTargetDevice, - isGPU(), hasRequiresUnifiedSharedMemory(), - CGM.getLangOpts().OpenMPOffloadMandatory); + llvm::OpenMPIRBuilderConfig Config( + CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), + CGM.getLangOpts().OpenMPOffloadMandatory, + /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, + hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); OMPBuilder.setConfig(Config); if (!CGM.getLangOpts().OpenMPIsTargetDevice) llvm_unreachable("OpenMP can only handle device code."); + if (CGM.getLangOpts().OpenMPCUDAMode) + CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; + llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) return; @@ -900,11 +916,7 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, ProcBindKind ProcBind, SourceLocation Loc) { - // Do nothing in case of SPMD mode and L0 parallel. - if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) - return; - - CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); + // Nothing to do. } void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, @@ -1046,10 +1058,8 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( } void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, - SourceLocation Loc, - bool WithSPMDCheck) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && - getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) + SourceLocation Loc) { + if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) return; CGBuilderTy &Bld = CGF.Builder; @@ -1158,10 +1168,8 @@ void CGOpenMPRuntimeGPU::getKmpcFreeShared( {AddrSizePair.first, AddrSizePair.second}); } -void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF, - bool WithSPMDCheck) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && - getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) +void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { + if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) return; const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); @@ -1196,11 +1204,18 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, if (!CGF.HaveInsertPoint()) return; + bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; - OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); + // We don't emit any thread id function call in bare kernel, but because the + // outlined function has a pointer argument, we emit a nullptr here. + if (IsBareKernel) + OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy)); + else + OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); @@ -1405,9 +1420,7 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, return CGF.Builder.CreateIntCast(Val, LLVMCastTy, CastTy->hasSignedIntegerRepresentation()); Address CastItem = CGF.CreateMemTemp(CastTy); - Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()), - Val->getType()); + Address ValCastItem = CastItem.withElementType(Val->getType()); CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); @@ -1543,11 +1556,6 @@ enum CopyAction : unsigned { RemoteLaneToThread, // ThreadCopy: Make a copy of a Reduce list on the thread's stack. ThreadCopy, - // ThreadToScratchpad: Copy a team-reduced array to the scratchpad. - ThreadToScratchpad, - // ScratchpadToThread: Copy from a scratchpad array in global memory - // containing team-reduced data to a thread's stack. - ScratchpadToThread, }; } // namespace @@ -1569,13 +1577,10 @@ static void emitReductionListCopy( CGBuilderTy &Bld = CGF.Builder; llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; - llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex; - llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth; // Iterates, element-by-element, through the source Reduce list and // make a copy. unsigned Idx = 0; - unsigned Size = Privates.size(); for (const Expr *Private : Privates) { Address SrcElementAddr = Address::invalid(); Address DestElementAddr = Address::invalid(); @@ -1585,10 +1590,6 @@ static void emitReductionListCopy( // Set to true to update the pointer in the dest Reduce list to a // newly created element. bool UpdateDestListPtr = false; - // Increment the src or dest pointer to the scratchpad, for each - // new element. - bool IncrScratchpadSrc = false; - bool IncrScratchpadDest = false; QualType PrivatePtrType = C.getPointerType(Private->getType()); llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType); @@ -1624,49 +1625,6 @@ static void emitReductionListCopy( PrivatePtrType->castAs<PointerType>()); break; } - case ThreadToScratchpad: { - // Step 1.1: Get the address for the src element in the Reduce list. - Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); - SrcElementAddr = CGF.EmitLoadOfPointer( - SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), - PrivatePtrType->castAs<PointerType>()); - - // Step 1.2: Get the address for dest element: - // address = base + index * ElementSizeInChars. - llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - llvm::Value *CurrentOffset = - Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); - llvm::Value *ScratchPadElemAbsolutePtrVal = - Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); - ScratchPadElemAbsolutePtrVal = - Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, - C.getTypeAlignInChars(Private->getType())); - IncrScratchpadDest = true; - break; - } - case ScratchpadToThread: { - // Step 1.1: Get the address for the src element in the scratchpad. - // address = base + index * ElementSizeInChars. - llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - llvm::Value *CurrentOffset = - Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); - llvm::Value *ScratchPadElemAbsolutePtrVal = - Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); - ScratchPadElemAbsolutePtrVal = - Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, - C.getTypeAlignInChars(Private->getType())); - IncrScratchpadSrc = true; - - // Step 1.2: Create a temporary to store the element in the destination - // Reduce list. - DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); - DestElementAddr = - CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); - UpdateDestListPtr = true; - break; - } } // Regardless of src and dest of copy, we emit the load of src @@ -1724,39 +1682,6 @@ static void emitReductionListCopy( C.VoidPtrTy); } - // Step 4.1: Increment SrcBase/DestBase so that it points to the starting - // address of the next element in scratchpad memory, unless we're currently - // processing the last one. Memory alignment is also taken care of here. - if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { - // FIXME: This code doesn't make any sense, it's trying to perform - // integer arithmetic on pointers. - llvm::Value *ScratchpadBasePtr = - IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer(); - llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); - ScratchpadBasePtr = Bld.CreateNUWAdd( - ScratchpadBasePtr, - Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars)); - - // Take care of global memory alignment for performance - ScratchpadBasePtr = Bld.CreateNUWSub( - ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); - ScratchpadBasePtr = Bld.CreateUDiv( - ScratchpadBasePtr, - llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); - ScratchpadBasePtr = Bld.CreateNUWAdd( - ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); - ScratchpadBasePtr = Bld.CreateNUWMul( - ScratchpadBasePtr, - llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); - - if (IncrScratchpadDest) - DestBase = - Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign()); - else /* IncrScratchpadSrc = true */ - SrcBase = - Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign()); - } - ++Idx; } } @@ -1784,12 +1709,12 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, // At the stage of the computation when this function is called, partially // aggregated values reside in the first lane of every active warp. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // NumWarps: number of warps active in the parallel region. This could // be smaller than 32 (max warps in a CTA) for partial block reduction. ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.getIntTypeForBitwidth(32, /* Signed */ true), - ImplicitParamDecl::Other); + ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&ReduceListArg); Args.push_back(&NumWarpsArg); @@ -1914,12 +1839,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); // Casting to actual data type. // MediumPtr = (CopyType*)MediumPtrAddr; - Address MediumPtr( - Bld.CreateBitCast( - MediumPtrVal, - CopyType->getPointerTo( - MediumPtrVal->getType()->getPointerAddressSpace())), - CopyType, Align); + Address MediumPtr(MediumPtrVal, CopyType, Align); // elem = *elemptr //*MediumPtr = elem @@ -1966,12 +1886,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, TransferMedium->getValueType(), TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); // SrcMediumVal = *SrcMediumPtr; - Address SrcMediumPtr( - Bld.CreateBitCast( - SrcMediumPtrVal, - CopyType->getPointerTo( - SrcMediumPtrVal->getType()->getPointerAddressSpace())), - CopyType, Align); + Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align); // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); @@ -2082,16 +1997,16 @@ static llvm::Function *emitShuffleAndReduceFunction( // Thread local Reduce list used to host the values of data to be reduced. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // Current lane id; could be logical. ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); // Offset of the remote source lane relative to the current lane. ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamDecl::Other); + C.ShortTy, ImplicitParamKind::Other); // Algorithm version. This is expected to be known at compile time. ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.ShortTy, ImplicitParamDecl::Other); + C.ShortTy, ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&ReduceListArg); Args.push_back(&LaneIDArg); @@ -2243,13 +2158,13 @@ static llvm::Value *emitListToGlobalCopyFunction( // Buffer: global reduction buffer. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // Idx: index of the buffer. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); // ReduceList: thread local Reduce list. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&BufferArg); Args.push_back(&IdxArg); @@ -2282,8 +2197,7 @@ static llvm::Value *emitListToGlobalCopyFunction( llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), LLVMReductionsBufferTy->getPointerTo()); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), /*Volatile=*/false, C.IntTy, Loc)}; unsigned Idx = 0; @@ -2301,12 +2215,12 @@ static llvm::Value *emitListToGlobalCopyFunction( const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); // Global = Buffer.VD[Idx]; const FieldDecl *FD = VarFieldMap.lookup(VD); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD); Address GlobAddr = GlobLVal.getAddress(CGF); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(), - GlobAddr.getPointer(), Idxs); - GlobLVal.setAddress(Address(BufferPtr, + GlobLVal.setAddress(Address(GlobAddr.getPointer(), CGF.ConvertTypeForMem(Private->getType()), GlobAddr.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { @@ -2356,13 +2270,13 @@ static llvm::Value *emitListToGlobalReduceFunction( // Buffer: global reduction buffer. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // Idx: index of the buffer. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); // ReduceList: thread local Reduce list. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&BufferArg); Args.push_back(&IdxArg); @@ -2393,8 +2307,7 @@ static llvm::Value *emitListToGlobalReduceFunction( Address ReductionList = CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), /*Volatile=*/false, C.IntTy, Loc)}; unsigned Idx = 0; @@ -2403,12 +2316,13 @@ static llvm::Value *emitListToGlobalReduceFunction( // Global = Buffer.VD[Idx]; const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); const FieldDecl *FD = VarFieldMap.lookup(VD); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD); Address GlobAddr = GlobLVal.getAddress(CGF); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( - GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); - CGF.EmitStoreOfScalar(BufferPtr, Elem, /*Volatile=*/false, C.VoidPtrTy); + CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false, + C.VoidPtrTy); if ((*IPriv)->getType()->isVariablyModifiedType()) { // Store array size. ++Idx; @@ -2450,13 +2364,13 @@ static llvm::Value *emitGlobalToListCopyFunction( // Buffer: global reduction buffer. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // Idx: index of the buffer. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); // ReduceList: thread local Reduce list. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&BufferArg); Args.push_back(&IdxArg); @@ -2490,8 +2404,7 @@ static llvm::Value *emitGlobalToListCopyFunction( CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), LLVMReductionsBufferTy->getPointerTo()); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), /*Volatile=*/false, C.IntTy, Loc)}; unsigned Idx = 0; @@ -2509,12 +2422,12 @@ static llvm::Value *emitGlobalToListCopyFunction( const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); // Global = Buffer.VD[Idx]; const FieldDecl *FD = VarFieldMap.lookup(VD); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD); Address GlobAddr = GlobLVal.getAddress(CGF); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(), - GlobAddr.getPointer(), Idxs); - GlobLVal.setAddress(Address(BufferPtr, + GlobLVal.setAddress(Address(GlobAddr.getPointer(), CGF.ConvertTypeForMem(Private->getType()), GlobAddr.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { @@ -2564,13 +2477,13 @@ static llvm::Value *emitGlobalToListReduceFunction( // Buffer: global reduction buffer. ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); // Idx: index of the buffer. ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); // ReduceList: thread local Reduce list. ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamDecl::Other); + C.VoidPtrTy, ImplicitParamKind::Other); FunctionArgList Args; Args.push_back(&BufferArg); Args.push_back(&IdxArg); @@ -2601,8 +2514,7 @@ static llvm::Value *emitGlobalToListReduceFunction( Address ReductionList = CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); auto IPriv = Privates.begin(); - llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), - CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), /*Volatile=*/false, C.IntTy, Loc)}; unsigned Idx = 0; @@ -2611,12 +2523,13 @@ static llvm::Value *emitGlobalToListReduceFunction( // Global = Buffer.VD[Idx]; const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); const FieldDecl *FD = VarFieldMap.lookup(VD); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); LValue GlobLVal = CGF.EmitLValueForField( - CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD); Address GlobAddr = GlobLVal.getAddress(CGF); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( - GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); - CGF.EmitStoreOfScalar(BufferPtr, Elem, /*Volatile=*/false, C.VoidPtrTy); + CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false, + C.VoidPtrTy); if ((*IPriv)->getType()->isVariablyModifiedType()) { // Store array size. ++Idx; @@ -2907,15 +2820,25 @@ void CGOpenMPRuntimeGPU::emitReduction( assert((TeamsReduction || ParallelReduction) && "Invalid reduction selection in emitReduction."); + llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; + llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); + int Cnt = 0; + for (const Expr *DRE : Privates) { + PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); + ++Cnt; + } + + ASTContext &C = CGM.getContext(); + const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( + CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1); + // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), // RedList, shuffle_reduce_func, interwarp_copy_func); // or // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadId = getThreadID(CGF, Loc); llvm::Value *Res; - ASTContext &C = CGM.getContext(); // 1. Build a list of reduction variables. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; auto Size = RHSExprs.size(); @@ -2925,9 +2848,9 @@ void CGOpenMPRuntimeGPU::emitReduction( ++Size; } llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); + QualType ReductionArrayTy = C.getConstantArrayType( + C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal, + /*IndexTypeQuals=*/0); Address ReductionList = CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); auto IPriv = Privates.begin(); @@ -2957,19 +2880,17 @@ void CGOpenMPRuntimeGPU::emitReduction( llvm::Function *ReductionFn = emitReductionFunction( CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy), Privates, LHSExprs, RHSExprs, ReductionOps); - llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); + llvm::Value *ReductionDataSize = + CGF.getTypeSize(C.getRecordType(ReductionRec)); + ReductionDataSize = + CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty); llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( CGM, Privates, ReductionArrayTy, ReductionFn, Loc); llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); if (ParallelReduction) { - llvm::Value *Args[] = {RTLoc, - ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, + llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn, InterWarpCopyFn}; Res = CGF.EmitRuntimeCall( @@ -2978,42 +2899,27 @@ void CGOpenMPRuntimeGPU::emitReduction( Args); } else { assert(TeamsReduction && "expected teams reduction."); - llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; - llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); - int Cnt = 0; - for (const Expr *DRE : Privates) { - PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); - ++Cnt; - } - const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( - CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, - C.getLangOpts().OpenMPCUDAReductionBufNum); - TeamsReductions.push_back(TeamReductionRec); - if (!KernelTeamsReductionPtr) { - KernelTeamsReductionPtr = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_teams_reductions_buffer_$_$ptr"); - } - llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( - Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); + TeamsReductions.push_back(ReductionRec); + auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer), + {}, "_openmp_teams_reductions_buffer_$_$ptr"); llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); + CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap); llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, + CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap, ReductionFn); llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); + CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap); llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( - CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, + CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap, ReductionFn); llvm::Value *Args[] = { RTLoc, - ThreadId, - GlobalBufferPtr, + KernelTeamsReductionPtr, CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), + ReductionDataSize, RL, ShuffleAndReduceFn, InterWarpCopyFn, @@ -3055,14 +2961,7 @@ void CGOpenMPRuntimeGPU::emitReduction( ++IRHS; } }; - llvm::Value *EndArgs[] = {ThreadId}; RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, std::nullopt, - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait), - EndArgs); - RCG.setAction(Action); RCG(CGF); // There is no need to emit line number for unconditional branch. (void)ApplyDebugLocation::CreateEmpty(CGF); @@ -3092,7 +2991,7 @@ CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, if (isa<ImplicitParamDecl>(NativeParam)) return ImplicitParamDecl::Create( CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), - NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); + NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); return ParmVarDecl::Create( CGM.getContext(), const_cast<DeclContext *>(NativeParam->getDeclContext()), @@ -3118,11 +3017,7 @@ CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, QualType TargetTy = TargetParam->getType(); llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation()); - // First cast to generic. - TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - TargetAddr, - llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace=*/0)); - // Cast from generic to native address space. + // Cast to native address space. TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( TargetAddr, llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace)); @@ -3149,11 +3044,8 @@ void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( TargetArgs.emplace_back(NativeArg); continue; } - llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - NativeArg, - llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace*/ 0)); TargetArgs.emplace_back( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType)); } CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); } @@ -3175,10 +3067,10 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), /*Id=*/nullptr, Int16QTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), /*Id=*/nullptr, Int32QTy, - ImplicitParamDecl::Other); + ImplicitParamKind::Other); WrapperArgs.emplace_back(&ParallelLevelArg); WrapperArgs.emplace_back(&WrapperArg); @@ -3291,7 +3183,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) + if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) return; assert(D && "Expected function or captured|block decl."); @@ -3343,13 +3235,13 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, Data.insert(std::make_pair(VD, MappedVarData())); } if (!NeedToDelayGlobalization) { - emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); + emitGenericVarsProlog(CGF, D->getBeginLoc()); struct GlobalizationScope final : EHScopeStack::Cleanup { GlobalizationScope() = default; void Emit(CodeGenFunction &CGF, Flags flags) override { static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) - .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true); + .emitGenericVarsEpilog(CGF); } }; CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); @@ -3400,7 +3292,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, VarTy, Align); } - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) + if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) return Address::invalid(); VD = VD->getCanonicalDecl(); @@ -3633,6 +3525,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1200: + case CudaArch::GFX1201: case CudaArch::Generic: case CudaArch::UNUSED: case CudaArch::UNKNOWN: @@ -3645,42 +3539,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( CGOpenMPRuntime::processRequiresDirective(D); } -void CGOpenMPRuntimeGPU::clear() { - - if (!TeamsReductions.empty()) { - ASTContext &C = CGM.getContext(); - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - for (const RecordDecl *TeamReductionRec : TeamsReductions) { - QualType RecTy = C.getRecordType(TeamReductionRec); - auto *Field = FieldDecl::Create( - C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, - C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - StaticRD->addDecl(Field); - } - StaticRD->completeDefinition(); - QualType StaticTy = C.getRecordType(StaticRD); - llvm::Type *LLVMReductionsBufferTy = - CGM.getTypes().ConvertTypeForMem(StaticTy); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMReductionsBufferTy, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMReductionsBufferTy), - "_openmp_teams_reductions_buffer_$_"); - KernelTeamsReductionPtr->setInitializer( - llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, - CGM.VoidPtrTy)); - } - CGOpenMPRuntime::clear(); -} - llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; llvm::Module *M = &CGF.CGM.getModule(); |
