diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-19 21:12:03 +0000 |
| commit | c9157d925c489f07ba9c0b2ce47e5149b75969a5 (patch) | |
| tree | 08bc4a3d9cad3f9ebffa558ddf140b9d9257b219 /contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | |
| parent | 2a66844f606a35d68ad8a8061f4bea204274b3bc (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1860 |
1 files changed, 1452 insertions, 408 deletions
diff --git a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 4c3696f9c342..ce428f78dc84 100644 --- a/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -14,6 +14,7 @@ #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" @@ -22,18 +23,27 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Frontend/Offloading/Utility.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -135,6 +145,19 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) { } #endif +static const omp::GV &getGridValue(const Triple &T, Function *Kernel) { + if (T.isAMDGPU()) { + StringRef Features = + Kernel->getFnAttribute("target-features").getValueAsString(); + if (Features.count("+wavefrontsize64")) + return omp::getAMDGPUGridValues<64>(); + return omp::getAMDGPUGridValues<32>(); + } + if (T.isNVPTX()) + return omp::NVPTXGridValues; + llvm_unreachable("No grid value available for this architecture!"); +} + /// Determine which scheduling algorithm to use, determined from schedule clause /// arguments. static OMPScheduleType @@ -331,6 +354,140 @@ BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, return splitBB(Builder, CreateBranch, Old->getName() + Suffix); } +// This function creates a fake integer value and a fake use for the integer +// value. It returns the fake value created. This is useful in modeling the +// extra arguments to the outlined functions. +Value *createFakeIntVal(IRBuilder<> &Builder, + OpenMPIRBuilder::InsertPointTy OuterAllocaIP, + std::stack<Instruction *> &ToBeDeleted, + OpenMPIRBuilder::InsertPointTy InnerAllocaIP, + const Twine &Name = "", bool AsPtr = true) { + Builder.restoreIP(OuterAllocaIP); + Instruction *FakeVal; + AllocaInst *FakeValAddr = + Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr"); + ToBeDeleted.push(FakeValAddr); + + if (AsPtr) { + FakeVal = FakeValAddr; + } else { + FakeVal = + Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val"); + ToBeDeleted.push(FakeVal); + } + + // Generate a fake use of this value + Builder.restoreIP(InnerAllocaIP); + Instruction *UseFakeVal; + if (AsPtr) { + UseFakeVal = + Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use"); + } else { + UseFakeVal = + cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10))); + } + ToBeDeleted.push(UseFakeVal); + return FakeVal; +} + +//===----------------------------------------------------------------------===// +// OpenMPIRBuilderConfig +//===----------------------------------------------------------------------===// + +namespace { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); +/// Values for bit flags for marking which requires clauses have been used. +enum OpenMPOffloadingRequiresDirFlags { + /// flag undefined. + OMP_REQ_UNDEFINED = 0x000, + /// no requires directive present. + OMP_REQ_NONE = 0x001, + /// reverse_offload clause. + OMP_REQ_REVERSE_OFFLOAD = 0x002, + /// unified_address clause. + OMP_REQ_UNIFIED_ADDRESS = 0x004, + /// unified_shared_memory clause. + OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, + /// dynamic_allocators clause. + OMP_REQ_DYNAMIC_ALLOCATORS = 0x010, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS) +}; + +} // anonymous namespace + +OpenMPIRBuilderConfig::OpenMPIRBuilderConfig() + : RequiresFlags(OMP_REQ_UNDEFINED) {} + +OpenMPIRBuilderConfig::OpenMPIRBuilderConfig( + bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory, + bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress, + bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators) + : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU), + OpenMPOffloadMandatory(OpenMPOffloadMandatory), + RequiresFlags(OMP_REQ_UNDEFINED) { + if (HasRequiresReverseOffload) + RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD; + if (HasRequiresUnifiedAddress) + RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS; + if (HasRequiresUnifiedSharedMemory) + RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY; + if (HasRequiresDynamicAllocators) + RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS; +} + +bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const { + return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD; +} + +bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const { + return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS; +} + +bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const { + return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY; +} + +bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const { + return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS; +} + +int64_t OpenMPIRBuilderConfig::getRequiresFlags() const { + return hasRequiresFlags() ? RequiresFlags + : static_cast<int64_t>(OMP_REQ_NONE); +} + +void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) { + if (Value) + RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD; + else + RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD; +} + +void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) { + if (Value) + RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS; + else + RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS; +} + +void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) { + if (Value) + RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY; + else + RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY; +} + +void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) { + if (Value) + RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS; + else + RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS; +} + +//===----------------------------------------------------------------------===// +// OpenMPIRBuilder +//===----------------------------------------------------------------------===// + void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector<Value *> &ArgsVector) { @@ -362,7 +519,6 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { LLVMContext &Ctx = Fn.getContext(); - Triple T(M.getTargetTriple()); // Get the function's current attributes. auto Attrs = Fn.getAttributes(); @@ -383,9 +539,9 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { if (Param) { if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt)) FnAS = FnAS.addAttribute(Ctx, AK); - } else - if (auto AK = TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt)) - FnAS = FnAS.addAttribute(Ctx, AK); + } else if (auto AK = + TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt)) + FnAS = FnAS.addAttribute(Ctx, AK); } else { FnAS = FnAS.addAttributes(Ctx, AS); } @@ -399,7 +555,7 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { #define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \ case Enum: \ FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \ - addAttrSet(RetAttrs, RetAttrSet, /*Param*/false); \ + addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \ for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \ addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \ Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \ @@ -475,31 +631,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { return Fn; } -void OpenMPIRBuilder::initialize(StringRef HostFilePath) { - initializeTypes(M); - - if (HostFilePath.empty()) - return; - - auto Buf = MemoryBuffer::getFile(HostFilePath); - if (std::error_code Err = Buf.getError()) { - report_fatal_error(("error opening host file from host file path inside of " - "OpenMPIRBuilder: " + - Err.message()) - .c_str()); - } - - LLVMContext Ctx; - auto M = expectedToErrorOrAndEmitErrors( - Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx)); - if (std::error_code Err = M.getError()) { - report_fatal_error( - ("error parsing host file inside of OpenMPIRBuilder: " + Err.message()) - .c_str()); - } - - loadOffloadInfoMetadata(*M.get()); -} +void OpenMPIRBuilder::initialize() { initializeTypes(M); } void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; @@ -519,6 +651,13 @@ void OpenMPIRBuilder::finalize(Function *Fn) { Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); + // If we generate code for the target device, we need to allocate + // struct for aggregate params in the device default alloca address space. + // OpenMP runtime requires that the params of the extracted functions are + // passed as zero address space pointers. This flag ensures that + // CodeExtractor generates correct code for extracted functions + // which are used by OpenMP runtime. + bool ArgsInZeroAddressSpace = Config.isTargetDevice(); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, /* AggregateArgs */ true, /* BlockFrequencyInfo */ nullptr, @@ -527,7 +666,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) { /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocaBlock*/ OI.OuterAllocaBB, - /* Suffix */ ".omp_par"); + /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n"); LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName() @@ -572,7 +711,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (I.isTerminator()) continue; - I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); + I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); } OI.EntryBB->moveBefore(&ArtificialEntry); @@ -839,44 +978,6 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc, return Builder.saveIP(); } -void OpenMPIRBuilder::emitOffloadingEntry(Constant *Addr, StringRef Name, - uint64_t Size, int32_t Flags, - StringRef SectionName) { - Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); - Type *SizeTy = M.getDataLayout().getIntPtrType(M.getContext()); - - Constant *AddrName = ConstantDataArray::getString(M.getContext(), Name); - - // Create the constant string used to look up the symbol in the device. - auto *Str = - new llvm::GlobalVariable(M, AddrName->getType(), /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, AddrName, - ".omp_offloading.entry_name"); - Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - - // Construct the offloading entry. - Constant *EntryData[] = { - ConstantExpr::getPointerBitCastOrAddrSpaceCast(Addr, Int8PtrTy), - ConstantExpr::getPointerBitCastOrAddrSpaceCast(Str, Int8PtrTy), - ConstantInt::get(SizeTy, Size), - ConstantInt::get(Int32Ty, Flags), - ConstantInt::get(Int32Ty, 0), - }; - Constant *EntryInitializer = - ConstantStruct::get(OpenMPIRBuilder::OffloadEntry, EntryData); - - auto *Entry = new GlobalVariable( - M, OpenMPIRBuilder::OffloadEntry, - /* isConstant = */ true, GlobalValue::WeakAnyLinkage, EntryInitializer, - ".omp_offloading.entry." + Name, nullptr, GlobalValue::NotThreadLocal, - M.getDataLayout().getDefaultGlobalsAddressSpace()); - - // The entry has to be created in the section the linker expects it to be. - Entry->setSection(SectionName); - Entry->setAlignment(Align(1)); -} - OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel( const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, @@ -930,7 +1031,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch( (void)OutlinedFnID; // Return value of the runtime offloading call. - Value *Return; + Value *Return = nullptr; // Arguments for the target kernel. SmallVector<Value *> ArgsVector; @@ -1007,6 +1108,182 @@ void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag, Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin()); } +// Callback used to create OpenMP runtime calls to support +// omp parallel clause for the device. +// We need to use this callback to replace call to the OutlinedFn in OuterFn +// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51) +static void targetParallelCallback( + OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, + BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, + Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, + Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) { + // Add some known attributes. + IRBuilder<> &Builder = OMPIRBuilder->Builder; + OutlinedFn.addParamAttr(0, Attribute::NoAlias); + OutlinedFn.addParamAttr(1, Attribute::NoAlias); + OutlinedFn.addParamAttr(0, Attribute::NoUndef); + OutlinedFn.addParamAttr(1, Attribute::NoUndef); + OutlinedFn.addFnAttr(Attribute::NoUnwind); + + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + assert(CI && "Expected call instruction to outlined function"); + CI->getParent()->setName("omp_parallel"); + + Builder.SetInsertPoint(CI); + Type *PtrTy = OMPIRBuilder->VoidPtr; + Value *NullPtrValue = Constant::getNullValue(PtrTy); + + // Add alloca for kernel args + OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP(); + Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt()); + AllocaInst *ArgsAlloca = + Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars)); + Value *Args = ArgsAlloca; + // Add address space cast if array for storing arguments is not allocated + // in address space 0 + if (ArgsAlloca->getAddressSpace()) + Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy); + Builder.restoreIP(CurrentIP); + + // Store captured vars which are used by kmpc_parallel_51 + for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) { + Value *V = *(CI->arg_begin() + 2 + Idx); + Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64( + ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx); + Builder.CreateStore(V, StoreAddress); + } + + Value *Cond = + IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32) + : Builder.getInt32(1); + + // Build kmpc_parallel_51 call + Value *Parallel51CallArgs[] = { + /* identifier*/ Ident, + /* global thread num*/ ThreadID, + /* if expression */ Cond, + /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1), + /* Proc bind */ Builder.getInt32(-1), + /* outlined function */ + Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr), + /* wrapper function */ NullPtrValue, + /* arguments of the outlined funciton*/ Args, + /* number of arguments */ Builder.getInt64(NumCapturedVars)}; + + FunctionCallee RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51); + + Builder.CreateCall(RTLFn, Parallel51CallArgs); + + LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + + // Initialize the local TID stack location with the argument value. + Builder.SetInsertPoint(PrivTID); + Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); + Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI), + PrivTIDAddr); + + // Remove redundant call to the outlined function. + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) { + I->eraseFromParent(); + } +} + +// Callback used to create OpenMP runtime calls to support +// omp parallel clause for the host. +// We need to use this callback to replace call to the OutlinedFn in OuterFn +// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if]) +static void +hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, + Function *OuterFn, Value *Ident, Value *IfCondition, + Instruction *PrivTID, AllocaInst *PrivTIDAddr, + const SmallVector<Instruction *, 4> &ToBeDeleted) { + IRBuilder<> &Builder = OMPIRBuilder->Builder; + FunctionCallee RTLFn; + if (IfCondition) { + RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if); + } else { + RTLFn = + OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call); + } + if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) { + if (!F->hasMetadata(LLVMContext::MD_callback)) { + LLVMContext &Ctx = F->getContext(); + MDBuilder MDB(Ctx); + // Annotate the callback behavior of the __kmpc_fork_call: + // - The callback callee is argument number 2 (microtask). + // - The first two arguments of the callback callee are unknown (-1). + // - All variadic arguments to the __kmpc_fork_call are passed to the + // callback callee. + F->addMetadata(LLVMContext::MD_callback, + *MDNode::get(Ctx, {MDB.createCallbackEncoding( + 2, {-1, -1}, + /* VarArgsArePassed */ true)})); + } + } + // Add some known attributes. + OutlinedFn.addParamAttr(0, Attribute::NoAlias); + OutlinedFn.addParamAttr(1, Attribute::NoAlias); + OutlinedFn.addFnAttr(Attribute::NoUnwind); + + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + CI->getParent()->setName("omp_parallel"); + Builder.SetInsertPoint(CI); + + // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn); + Value *ForkCallArgs[] = { + Ident, Builder.getInt32(NumCapturedVars), + Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)}; + + SmallVector<Value *, 16> RealArgs; + RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs)); + if (IfCondition) { + Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32); + RealArgs.push_back(Cond); + } + RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end()); + + // __kmpc_fork_call_if always expects a void ptr as the last argument + // If there are no arguments, pass a null pointer. + auto PtrTy = OMPIRBuilder->VoidPtr; + if (IfCondition && NumCapturedVars == 0) { + Value *NullPtrValue = Constant::getNullValue(PtrTy); + RealArgs.push_back(NullPtrValue); + } + if (IfCondition && RealArgs.back()->getType() != PtrTy) + RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy); + + Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With fork_call placed: " + << *Builder.GetInsertBlock()->getParent() << "\n"); + + // Initialize the local TID stack location with the argument value. + Builder.SetInsertPoint(PrivTID); + Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); + Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI), + PrivTIDAddr); + + // Remove redundant call to the outlined function. + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) { + I->eraseFromParent(); + } +} + IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( const LocationDescription &Loc, InsertPointTy OuterAllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, @@ -1021,9 +1298,16 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Value *ThreadID = getOrCreateThreadID(Ident); - - if (NumThreads) { - // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) + // If we generate code for the target device, we need to allocate + // struct for aggregate params in the device default alloca address space. + // OpenMP runtime requires that the params of the extracted functions are + // passed as zero address space pointers. This flag ensures that extracted + // function arguments are declared in zero address space + bool ArgsInZeroAddressSpace = Config.isTargetDevice(); + + // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) + // only if we compile for host side. + if (NumThreads && !Config.isTargetDevice()) { Value *Args[] = { Ident, ThreadID, Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)}; @@ -1054,13 +1338,28 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( // Change the location to the outer alloca insertion point to create and // initialize the allocas we pass into the parallel region. Builder.restoreIP(OuterAllocaIP); - AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); - AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr"); + AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); + AllocaInst *ZeroAddrAlloca = + Builder.CreateAlloca(Int32, nullptr, "zero.addr"); + Instruction *TIDAddr = TIDAddrAlloca; + Instruction *ZeroAddr = ZeroAddrAlloca; + if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) { + // Add additional casts to enforce pointers in zero address space + TIDAddr = new AddrSpaceCastInst( + TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast"); + TIDAddr->insertAfter(TIDAddrAlloca); + ToBeDeleted.push_back(TIDAddr); + ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca, + PointerType ::get(M.getContext(), 0), + "zero.addr.ascast"); + ZeroAddr->insertAfter(ZeroAddrAlloca); + ToBeDeleted.push_back(ZeroAddr); + } // We only need TIDAddr and ZeroAddr for modeling purposes to get the // associated arguments in the outlined function, so we delete them later. - ToBeDeleted.push_back(TIDAddr); - ToBeDeleted.push_back(ZeroAddr); + ToBeDeleted.push_back(TIDAddrAlloca); + ToBeDeleted.push_back(ZeroAddrAlloca); // Create an artificial insertion point that will also ensure the blocks we // are about to split are not degenerated. @@ -1128,87 +1427,24 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( BodyGenCB(InnerAllocaIP, CodeGenIP); LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); - FunctionCallee RTLFn; - if (IfCondition) - RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if); - else - RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call); - - if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) { - if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) { - llvm::LLVMContext &Ctx = F->getContext(); - MDBuilder MDB(Ctx); - // Annotate the callback behavior of the __kmpc_fork_call: - // - The callback callee is argument number 2 (microtask). - // - The first two arguments of the callback callee are unknown (-1). - // - All variadic arguments to the __kmpc_fork_call are passed to the - // callback callee. - F->addMetadata( - llvm::LLVMContext::MD_callback, - *llvm::MDNode::get( - Ctx, {MDB.createCallbackEncoding(2, {-1, -1}, - /* VarArgsArePassed */ true)})); - } - } OutlineInfo OI; - OI.PostOutlineCB = [=](Function &OutlinedFn) { - // Add some known attributes. - OutlinedFn.addParamAttr(0, Attribute::NoAlias); - OutlinedFn.addParamAttr(1, Attribute::NoAlias); - OutlinedFn.addFnAttr(Attribute::NoUnwind); - OutlinedFn.addFnAttr(Attribute::NoRecurse); - - assert(OutlinedFn.arg_size() >= 2 && - "Expected at least tid and bounded tid as arguments"); - unsigned NumCapturedVars = - OutlinedFn.arg_size() - /* tid & bounded tid */ 2; - - CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); - CI->getParent()->setName("omp_parallel"); - Builder.SetInsertPoint(CI); - - // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn); - Value *ForkCallArgs[] = { - Ident, Builder.getInt32(NumCapturedVars), - Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)}; - - SmallVector<Value *, 16> RealArgs; - RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs)); - if (IfCondition) { - Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, - Type::getInt32Ty(M.getContext())); - RealArgs.push_back(Cond); - } - RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end()); - - // __kmpc_fork_call_if always expects a void ptr as the last argument - // If there are no arguments, pass a null pointer. - auto PtrTy = Type::getInt8PtrTy(M.getContext()); - if (IfCondition && NumCapturedVars == 0) { - llvm::Value *Void = ConstantPointerNull::get(PtrTy); - RealArgs.push_back(Void); - } - if (IfCondition && RealArgs.back()->getType() != PtrTy) - RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy); - - Builder.CreateCall(RTLFn, RealArgs); - - LLVM_DEBUG(dbgs() << "With fork_call placed: " - << *Builder.GetInsertBlock()->getParent() << "\n"); - - InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end()); - - // Initialize the local TID stack location with the argument value. - Builder.SetInsertPoint(PrivTID); - Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin(); - Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr); - - CI->eraseFromParent(); - - for (Instruction *I : ToBeDeleted) - I->eraseFromParent(); - }; + if (Config.isTargetDevice()) { + // Generate OpenMP target specific runtime call + OI.PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { + targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident, + IfCondition, NumThreads, PrivTID, PrivTIDAddr, + ThreadID, ToBeDeletedVec); + }; + } else { + // Generate OpenMP host runtime call + OI.PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { + hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, + PrivTID, PrivTIDAddr, ToBeDeletedVec); + }; + } // Adjust the finalization stack, verify the adjustment, and call the // finalize function a last time to finalize values between the pre-fini @@ -1248,7 +1484,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ OuterAllocaBlock, - /* Suffix */ ".omp_par"); + /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); // Find inputs to, outputs from the code region. BasicBlock *CommonExit = nullptr; @@ -1413,6 +1649,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition, SmallVector<DependData> Dependencies) { + if (!updateToLocation(Loc)) return InsertPointTy(); @@ -1440,41 +1677,31 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, BasicBlock *TaskAllocaBB = splitBB(Builder, /*CreateBranch=*/true, "task.alloca"); + InsertPointTy TaskAllocaIP = + InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin()); + InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin()); + BodyGenCB(TaskAllocaIP, TaskBodyIP); + OutlineInfo OI; OI.EntryBB = TaskAllocaBB; OI.OuterAllocaBB = AllocaIP.getBlock(); OI.ExitBB = TaskExitBB; - OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, - Dependencies](Function &OutlinedFn) { - // The input IR here looks like the following- - // ``` - // func @current_fn() { - // outlined_fn(%args) - // } - // func @outlined_fn(%args) { ... } - // ``` - // - // This is changed to the following- - // - // ``` - // func @current_fn() { - // runtime_call(..., wrapper_fn, ...) - // } - // func @wrapper_fn(..., %args) { - // outlined_fn(%args) - // } - // func @outlined_fn(%args) { ... } - // ``` - // The stale call instruction will be replaced with a new call instruction - // for runtime call with a wrapper function. + // Add the thread ID argument. + std::stack<Instruction *> ToBeDeleted; + OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); + + OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, + TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable { + // Replace the Stale CI by appropriate RTL function call. assert(OutlinedFn.getNumUses() == 1 && "there must be a single user for the outlined function"); CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back()); - // HasTaskData is true if any variables are captured in the outlined region, + // HasShareds is true if any variables are captured in the outlined region, // false otherwise. - bool HasTaskData = StaleCI->arg_size() > 0; + bool HasShareds = StaleCI->arg_size() > 1; Builder.SetInsertPoint(StaleCI); // Gather the arguments for emitting the runtime call for @@ -1502,10 +1729,17 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, // Argument - `sizeof_kmp_task_t` (TaskSize) // Tasksize refers to the size in bytes of kmp_task_t data structure // including private vars accessed in task. - Value *TaskSize = Builder.getInt64(0); - if (HasTaskData) { + // TODO: add kmp_task_t_with_privates (privates) + Value *TaskSize = Builder.getInt64( + divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8)); + + // Argument - `sizeof_shareds` (SharedsSize) + // SharedsSize refers to the shareds array size in the kmp_task_t data + // structure. + Value *SharedsSize = Builder.getInt64(0); + if (HasShareds) { AllocaInst *ArgStructAlloca = - dyn_cast<AllocaInst>(StaleCI->getArgOperand(0)); + dyn_cast<AllocaInst>(StaleCI->getArgOperand(1)); assert(ArgStructAlloca && "Unable to find the alloca instruction corresponding to arguments " "for extracted function"); @@ -1513,51 +1747,34 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, dyn_cast<StructType>(ArgStructAlloca->getAllocatedType()); assert(ArgStructType && "Unable to find struct type corresponding to " "arguments for extracted function"); - TaskSize = + SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); } - - // TODO: Argument - sizeof_shareds - - // Argument - task_entry (the wrapper function) - // If the outlined function has some captured variables (i.e. HasTaskData is - // true), then the wrapper function will have an additional argument (the - // struct containing captured variables). Otherwise, no such argument will - // be present. - SmallVector<Type *> WrapperArgTys{Builder.getInt32Ty()}; - if (HasTaskData) - WrapperArgTys.push_back(OutlinedFn.getArg(0)->getType()); - FunctionCallee WrapperFuncVal = M.getOrInsertFunction( - (Twine(OutlinedFn.getName()) + ".wrapper").str(), - FunctionType::get(Builder.getInt32Ty(), WrapperArgTys, false)); - Function *WrapperFunc = dyn_cast<Function>(WrapperFuncVal.getCallee()); - // Emit the @__kmpc_omp_task_alloc runtime call // The runtime call returns a pointer to an area where the task captured - // variables must be copied before the task is run (NewTaskData) - CallInst *NewTaskData = Builder.CreateCall( - TaskAllocFn, - {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, - /*sizeof_task=*/TaskSize, /*sizeof_shared=*/Builder.getInt64(0), - /*task_func=*/WrapperFunc}); + // variables must be copied before the task is run (TaskData) + CallInst *TaskData = Builder.CreateCall( + TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, + /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, + /*task_func=*/&OutlinedFn}); // Copy the arguments for outlined function - if (HasTaskData) { - Value *TaskData = StaleCI->getArgOperand(0); + if (HasShareds) { + Value *Shareds = StaleCI->getArgOperand(1); Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); - Builder.CreateMemCpy(NewTaskData, Alignment, TaskData, Alignment, - TaskSize); + Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, + SharedsSize); } - Value *DepArrayPtr = nullptr; + Value *DepArray = nullptr; if (Dependencies.size()) { InsertPointTy OldIP = Builder.saveIP(); Builder.SetInsertPoint( &OldIP.getBlock()->getParent()->getEntryBlock().back()); Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size()); - Value *DepArray = - Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr"); + DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr"); unsigned P = 0; for (const DependData &Dep : Dependencies) { @@ -1588,7 +1805,6 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, ++P; } - DepArrayPtr = Builder.CreateBitCast(DepArray, Builder.getInt8PtrTy()); Builder.restoreIP(OldIP); } @@ -1601,7 +1817,7 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, // br label %exit // else: // call @__kmpc_omp_task_begin_if0(...) - // call @wrapper_fn(...) + // call @outlined_fn(...) // call @__kmpc_omp_task_complete_if0(...) // br label %exit // exit: @@ -1609,10 +1825,9 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, if (IfCondition) { // `SplitBlockAndInsertIfThenElse` requires the block to have a // terminator. - BasicBlock *NewBasicBlock = - splitBB(Builder, /*CreateBranch=*/true, "if.end"); + splitBB(Builder, /*CreateBranch=*/true, "if.end"); Instruction *IfTerminator = - NewBasicBlock->getSinglePredecessor()->getTerminator(); + Builder.GetInsertPoint()->getParent()->getTerminator(); Instruction *ThenTI = IfTerminator, *ElseTI = nullptr; Builder.SetInsertPoint(IfTerminator); SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI, @@ -1622,12 +1837,14 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0); Function *TaskCompleteFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0); - Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, NewTaskData}); - if (HasTaskData) - Builder.CreateCall(WrapperFunc, {ThreadID, NewTaskData}); + Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData}); + CallInst *CI = nullptr; + if (HasShareds) + CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData}); else - Builder.CreateCall(WrapperFunc, {ThreadID}); - Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, NewTaskData}); + CI = Builder.CreateCall(&OutlinedFn, {ThreadID}); + CI->setDebugLoc(StaleCI->getDebugLoc()); + Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData}); Builder.SetInsertPoint(ThenTI); } @@ -1636,35 +1853,32 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc, getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps); Builder.CreateCall( TaskFn, - {Ident, ThreadID, NewTaskData, Builder.getInt32(Dependencies.size()), - DepArrayPtr, ConstantInt::get(Builder.getInt32Ty(), 0), - ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))}); + {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()), + DepArray, ConstantInt::get(Builder.getInt32Ty(), 0), + ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); } else { // Emit the @__kmpc_omp_task runtime call to spawn the task Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); - Builder.CreateCall(TaskFn, {Ident, ThreadID, NewTaskData}); + Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData}); } StaleCI->eraseFromParent(); - // Emit the body for wrapper function - BasicBlock *WrapperEntryBB = - BasicBlock::Create(M.getContext(), "", WrapperFunc); - Builder.SetInsertPoint(WrapperEntryBB); - if (HasTaskData) - Builder.CreateCall(&OutlinedFn, {WrapperFunc->getArg(1)}); - else - Builder.CreateCall(&OutlinedFn); - Builder.CreateRet(Builder.getInt32(0)); + Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin()); + if (HasShareds) { + LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); + OutlinedFn.getArg(1)->replaceUsesWithIf( + Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); + } + + while (!ToBeDeleted.empty()) { + ToBeDeleted.top()->eraseFromParent(); + ToBeDeleted.pop(); + } }; addOutlineInfo(std::move(OI)); - - InsertPointTy TaskAllocaIP = - InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin()); - InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin()); - BodyGenCB(TaskAllocaIP, TaskBodyIP); Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin()); return Builder.saveIP(); @@ -1832,7 +2046,7 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc, /// the given module and return it. Function *getFreshReductionFunc(Module &M) { Type *VoidTy = Type::getVoidTy(M.getContext()); - Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); + Type *Int8PtrTy = PointerType::getUnqual(M.getContext()); auto *FuncTy = FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false); return Function::Create(FuncTy, GlobalVariable::InternalLinkage, @@ -1866,7 +2080,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( // Create and populate array of type-erased pointers to private reduction // values. unsigned NumReductions = ReductionInfos.size(); - Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions); + Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions); Builder.restoreIP(AllocaIP); Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array"); @@ -1877,18 +2091,13 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( const ReductionInfo &RI = En.value(); Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64( RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index)); - Value *Casted = - Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(), - "private.red.var." + Twine(Index) + ".casted"); - Builder.CreateStore(Casted, RedArrayElemPtr); + Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr); } // Emit a call to the runtime function that orchestrates the reduction. // Declare the reduction function in the process. Function *Func = Builder.GetInsertBlock()->getParent(); Module *Module = Func->getParent(); - Value *RedArrayPtr = - Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr"); uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); bool CanGenerateAtomic = @@ -1911,8 +2120,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( : RuntimeFunction::OMPRTL___kmpc_reduce); CallInst *ReduceCall = Builder.CreateCall(ReduceFunc, - {Ident, ThreadId, NumVariables, RedArraySize, - RedArrayPtr, ReductionFunc, Lock}, + {Ident, ThreadId, NumVariables, RedArraySize, RedArray, + ReductionFunc, Lock}, "reduce"); // Create final reduction entry blocks for the atomic and non-atomic case. @@ -1981,12 +2190,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( const ReductionInfo &RI = En.value(); Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( RedArrayTy, LHSArrayPtr, 0, En.index()); - Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr); + Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr); Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( RedArrayTy, RHSArrayPtr, 0, En.index()); - Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr); + Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr); Value *RHSPtr = Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); @@ -2465,11 +2674,242 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop( return {DispatchAfter, DispatchAfter->getFirstInsertionPt()}; } +// Returns an LLVM function to call for executing an OpenMP static worksharing +// for loop depending on `type`. Only i32 and i64 are supported by the runtime. +// Always interpret integers as unsigned similarly to CanonicalLoopInfo. +static FunctionCallee +getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, + WorksharingLoopType LoopType) { + unsigned Bitwidth = Ty->getIntegerBitWidth(); + Module &M = OMPBuilder->M; + switch (LoopType) { + case WorksharingLoopType::ForStaticLoop: + if (Bitwidth == 32) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u); + if (Bitwidth == 64) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u); + break; + case WorksharingLoopType::DistributeStaticLoop: + if (Bitwidth == 32) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u); + if (Bitwidth == 64) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u); + break; + case WorksharingLoopType::DistributeForStaticLoop: + if (Bitwidth == 32) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u); + if (Bitwidth == 64) + return OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u); + break; + } + if (Bitwidth != 32 && Bitwidth != 64) { + llvm_unreachable("Unknown OpenMP loop iterator bitwidth"); + } + llvm_unreachable("Unknown type of OpenMP worksharing loop"); +} + +// Inserts a call to proper OpenMP Device RTL function which handles +// loop worksharing. +static void createTargetLoopWorkshareCall( + OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, + BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, + Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) { + Type *TripCountTy = TripCount->getType(); + Module &M = OMPBuilder->M; + IRBuilder<> &Builder = OMPBuilder->Builder; + FunctionCallee RTLFn = + getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType); + SmallVector<Value *, 8> RealArgs; + RealArgs.push_back(Ident); + RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr)); + RealArgs.push_back(LoopBodyArg); + RealArgs.push_back(TripCount); + if (LoopType == WorksharingLoopType::DistributeStaticLoop) { + RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); + Builder.CreateCall(RTLFn, RealArgs); + return; + } + FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads); + Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())}); + Value *NumThreads = Builder.CreateCall(RTLNumThreads, {}); + + RealArgs.push_back( + Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast")); + RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); + if (LoopType == WorksharingLoopType::DistributeForStaticLoop) { + RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); + } + + Builder.CreateCall(RTLFn, RealArgs); +} + +static void +workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, + CanonicalLoopInfo *CLI, Value *Ident, + Function &OutlinedFn, Type *ParallelTaskPtr, + const SmallVector<Instruction *, 4> &ToBeDeleted, + WorksharingLoopType LoopType) { + IRBuilder<> &Builder = OMPIRBuilder->Builder; + BasicBlock *Preheader = CLI->getPreheader(); + Value *TripCount = CLI->getTripCount(); + + // After loop body outling, the loop body contains only set up + // of loop body argument structure and the call to the outlined + // loop body function. Firstly, we need to move setup of loop body args + // into loop preheader. + Preheader->splice(std::prev(Preheader->end()), CLI->getBody(), + CLI->getBody()->begin(), std::prev(CLI->getBody()->end())); + + // The next step is to remove the whole loop. We do not it need anymore. + // That's why make an unconditional branch from loop preheader to loop + // exit block + Builder.restoreIP({Preheader, Preheader->end()}); + Preheader->getTerminator()->eraseFromParent(); + Builder.CreateBr(CLI->getExit()); + + // Delete dead loop blocks + OpenMPIRBuilder::OutlineInfo CleanUpInfo; + SmallPtrSet<BasicBlock *, 32> RegionBlockSet; + SmallVector<BasicBlock *, 32> BlocksToBeRemoved; + CleanUpInfo.EntryBB = CLI->getHeader(); + CleanUpInfo.ExitBB = CLI->getExit(); + CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved); + DeleteDeadBlocks(BlocksToBeRemoved); + + // Find the instruction which corresponds to loop body argument structure + // and remove the call to loop body function instruction. + Value *LoopBodyArg; + User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser(); + assert(OutlinedFnUser && + "Expected unique undroppable user of outlined function"); + CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser); + assert(OutlinedFnCallInstruction && "Expected outlined function call"); + assert((OutlinedFnCallInstruction->getParent() == Preheader) && + "Expected outlined function call to be located in loop preheader"); + // Check in case no argument structure has been passed. + if (OutlinedFnCallInstruction->arg_size() > 1) + LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1); + else + LoopBodyArg = Constant::getNullValue(Builder.getPtrTy()); + OutlinedFnCallInstruction->eraseFromParent(); + + createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident, + LoopBodyArg, ParallelTaskPtr, TripCount, + OutlinedFn); + + for (auto &ToBeDeletedItem : ToBeDeleted) + ToBeDeletedItem->eraseFromParent(); + CLI->invalidate(); +} + +OpenMPIRBuilder::InsertPointTy +OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, + InsertPointTy AllocaIP, + WorksharingLoopType LoopType) { + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + OutlineInfo OI; + OI.OuterAllocaBB = CLI->getPreheader(); + Function *OuterFn = CLI->getPreheader()->getParent(); + + // Instructions which need to be deleted at the end of code generation + SmallVector<Instruction *, 4> ToBeDeleted; + + OI.OuterAllocaBB = AllocaIP.getBlock(); + + // Mark the body loop as region which needs to be extracted + OI.EntryBB = CLI->getBody(); + OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(), + "omp.prelatch", true); + + // Prepare loop body for extraction + Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()}); + + // Insert new loop counter variable which will be used only in loop + // body. + AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, ""); + Instruction *NewLoopCntLoad = + Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt); + // New loop counter instructions are redundant in the loop preheader when + // code generation for workshare loop is finshed. That's why mark them as + // ready for deletion. + ToBeDeleted.push_back(NewLoopCntLoad); + ToBeDeleted.push_back(NewLoopCnt); + + // Analyse loop body region. Find all input variables which are used inside + // loop body region. + SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; + SmallVector<BasicBlock *, 32> Blocks; + OI.collectBlocks(ParallelRegionBlockSet, Blocks); + SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(), + ParallelRegionBlockSet.end()); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + CodeExtractor Extractor(Blocks, + /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock */ CLI->getPreheader(), + /* Suffix */ ".omp_wsloop", + /* AggrArgsIn0AddrSpace */ true); + + BasicBlock *CommonExit = nullptr; + SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands; + + // Find allocas outside the loop body region which are used inside loop + // body + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + + // We need to model loop body region as the function f(cnt, loop_arg). + // That's why we replace loop induction variable by the new counter + // which will be one of loop body function argument + for (auto Use = CLI->getIndVar()->user_begin(); + Use != CLI->getIndVar()->user_end(); ++Use) { + if (Instruction *Inst = dyn_cast<Instruction>(*Use)) { + if (ParallelRegionBlockSet.count(Inst->getParent())) { + Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad); + } + } + } + // Make sure that loop counter variable is not merged into loop body + // function argument structure and it is passed as separate variable + OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad); + + // PostOutline CB is invoked when loop body function is outlined and + // loop body is replaced by call to outlined function. We need to add + // call to OpenMP device rtl inside loop preheader. OpenMP device rtl + // function will handle loop control logic. + // + OI.PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { + workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr, + ToBeDeletedVec, LoopType); + }; + addOutlineInfo(std::move(OI)); + return CLI->getAfterIP(); +} + OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop( DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, - bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind, - llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier, - bool HasNonmonotonicModifier, bool HasOrderedClause) { + bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize, + bool HasSimdModifier, bool HasMonotonicModifier, + bool HasNonmonotonicModifier, bool HasOrderedClause, + WorksharingLoopType LoopType) { + if (Config.isTargetDevice()) + return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType); OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType( SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier, HasNonmonotonicModifier, HasOrderedClause); @@ -3311,7 +3751,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, /// "target-features" that determine the TargetMachine are per-function and can /// be overrided using __attribute__((target("OPTIONS"))). static std::unique_ptr<TargetMachine> -createTargetMachine(Function *F, CodeGenOpt::Level OptLevel) { +createTargetMachine(Function *F, CodeGenOptLevel OptLevel) { Module *M = F->getParent(); StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString(); @@ -3337,7 +3777,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { // Assume the user requests the most aggressive unrolling, even if the rest of // the code is optimized using a lower setting. - CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive; + CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive; std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel); FunctionAnalysisManager FAM; @@ -3370,7 +3810,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(L, SE, TTI, /*BlockFrequencyInfo=*/nullptr, - /*ProfileSummaryInfo=*/nullptr, ORE, OptLevel, + /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel), /*UserThreshold=*/std::nullopt, /*UserCount=*/std::nullopt, /*UserAllowPartial=*/true, @@ -3429,20 +3869,16 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { } } - unsigned NumInlineCandidates; - bool NotDuplicatable; - bool Convergent; - InstructionCost LoopSizeIC = - ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, - TTI, EphValues, UP.BEInsns); - LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n"); + UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); // Loop is not unrollable if the loop contains certain instructions. - if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) { + if (!UCE.canUnroll() || UCE.Convergent) { LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); return 1; } - unsigned LoopSize = *LoopSizeIC.getValue(); + + LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize() + << "\n"); // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might // be able to use it. @@ -3453,7 +3889,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { bool UseUpperBound = false; computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount, - MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP, + MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound); unsigned Factor = UP.Count; LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n"); @@ -3917,7 +4353,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropInit( Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType); if (NumDependences == nullptr) { NumDependences = ConstantInt::get(Int32, 0); - PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext()); + PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); DependenceAddress = ConstantPointerNull::get(PointerTypeVar); } Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); @@ -3944,7 +4380,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropDestroy( Device = ConstantInt::get(Int32, -1); if (NumDependences == nullptr) { NumDependences = ConstantInt::get(Int32, 0); - PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext()); + PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); DependenceAddress = ConstantPointerNull::get(PointerTypeVar); } Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); @@ -3972,7 +4408,7 @@ CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc, Device = ConstantInt::get(Int32, -1); if (NumDependences == nullptr) { NumDependences = ConstantInt::get(Int32, 0); - PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext()); + PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); DependenceAddress = ConstantPointerNull::get(PointerTypeVar); } Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); @@ -4006,24 +4442,103 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate( } OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD) { +OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, + int32_t MinThreadsVal, int32_t MaxThreadsVal, + int32_t MinTeamsVal, int32_t MaxTeamsVal) { if (!updateToLocation(Loc)) return Loc.IP; uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - ConstantInt *IsSPMDVal = ConstantInt::getSigned( - IntegerType::getInt8Ty(Int8->getContext()), - IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - ConstantInt *UseGenericStateMachine = - ConstantInt::getBool(Int32->getContext(), !IsSPMD); + Constant *IsSPMDVal = ConstantInt::getSigned( + Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); + Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD); + Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); + Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); + + Function *Kernel = Builder.GetInsertBlock()->getParent(); + + // Manifest the launch configuration in the metadata matching the kernel + // environment. + if (MinTeamsVal > 1 || MaxTeamsVal > 0) + writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal); + + // For max values, < 0 means unset, == 0 means set but unknown. + if (MaxThreadsVal < 0) + MaxThreadsVal = std::max( + int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal); + + if (MaxThreadsVal > 0) + writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal); + + Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal); + Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal); + Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal); + Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal); + Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); + Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); + + // We need to strip the debug prefix to get the correct kernel name. + StringRef KernelName = Kernel->getName(); + const std::string DebugPrefix = "_debug__"; + if (KernelName.ends_with(DebugPrefix)) + KernelName = KernelName.drop_back(DebugPrefix.length()); Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_init); - - CallInst *ThreadKind = Builder.CreateCall( - Fn, {Ident, IsSPMDVal, UseGenericStateMachine}); + const DataLayout &DL = Fn->getParent()->getDataLayout(); + + Twine DynamicEnvironmentName = KernelName + "_dynamic_environment"; + Constant *DynamicEnvironmentInitializer = + ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal}); + GlobalVariable *DynamicEnvironmentGV = new GlobalVariable( + M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage, + DynamicEnvironmentInitializer, DynamicEnvironmentName, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace()); + DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); + + Constant *DynamicEnvironment = + DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr + ? DynamicEnvironmentGV + : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV, + DynamicEnvironmentPtr); + + Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get( + ConfigurationEnvironment, { + UseGenericStateMachineVal, + MayUseNestedParallelismVal, + IsSPMDVal, + MinThreads, + MaxThreads, + MinTeams, + MaxTeams, + ReductionDataSize, + ReductionBufferLength, + }); + Constant *KernelEnvironmentInitializer = ConstantStruct::get( + KernelEnvironment, { + ConfigurationEnvironmentInitializer, + Ident, + DynamicEnvironment, + }); + Twine KernelEnvironmentName = KernelName + "_kernel_environment"; + GlobalVariable *KernelEnvironmentGV = new GlobalVariable( + M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage, + KernelEnvironmentInitializer, KernelEnvironmentName, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + DL.getDefaultGlobalsAddressSpace()); + KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); + + Constant *KernelEnvironment = + KernelEnvironmentGV->getType() == KernelEnvironmentPtr + ? KernelEnvironmentGV + : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV, + KernelEnvironmentPtr); + Value *KernelLaunchEnvironment = Kernel->getArg(0); + CallInst *ThreadKind = + Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment}); Value *ExecUserCode = Builder.CreateICmpEQ( ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), @@ -4057,46 +4572,153 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD) { } void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, - bool IsSPMD) { + int32_t TeamsReductionDataSize, + int32_t TeamsReductionBufferLength) { if (!updateToLocation(Loc)) return; - uint32_t SrcLocStrSize; - Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - ConstantInt *IsSPMDVal = ConstantInt::getSigned( - IntegerType::getInt8Ty(Int8->getContext()), - IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); - Function *Fn = getOrCreateRuntimeFunctionPtr( omp::RuntimeFunction::OMPRTL___kmpc_target_deinit); - Builder.CreateCall(Fn, {Ident, IsSPMDVal}); + Builder.CreateCall(Fn, {}); + + if (!TeamsReductionBufferLength || !TeamsReductionDataSize) + return; + + Function *Kernel = Builder.GetInsertBlock()->getParent(); + // We need to strip the debug prefix to get the correct kernel name. + StringRef KernelName = Kernel->getName(); + const std::string DebugPrefix = "_debug__"; + if (KernelName.ends_with(DebugPrefix)) + KernelName = KernelName.drop_back(DebugPrefix.length()); + auto *KernelEnvironmentGV = + M.getNamedGlobal((KernelName + "_kernel_environment").str()); + assert(KernelEnvironmentGV && "Expected kernel environment global\n"); + auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer(); + auto *NewInitializer = ConstantFoldInsertValueInstruction( + KernelEnvironmentInitializer, + ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7}); + NewInitializer = ConstantFoldInsertValueInstruction( + NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength), + {0, 8}); + KernelEnvironmentGV->setInitializer(NewInitializer); +} + +static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) { + Module &M = *Kernel.getParent(); + NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + for (auto *Op : MD->operands()) { + if (Op->getNumOperands() != 3) + continue; + auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0)); + if (!KernelOp || KernelOp->getValue() != &Kernel) + continue; + auto *Prop = dyn_cast<MDString>(Op->getOperand(1)); + if (!Prop || Prop->getString() != Name) + continue; + return Op; + } + return nullptr; +} + +static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, + bool Min) { + // Update the "maxntidx" metadata for NVIDIA, or add it. + MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name); + if (ExistingOp) { + auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); + int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); + ExistingOp->replaceOperandWith( + 2, ConstantAsMetadata::get(ConstantInt::get( + OldVal->getValue()->getType(), + Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value)))); + } else { + LLVMContext &Ctx = Kernel.getContext(); + Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel), + MDString::get(Ctx, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Ctx), Value))}; + // Append metadata to nvvm.annotations + Module &M = *Kernel.getParent(); + NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + MD->addOperand(MDNode::get(Ctx, MDVals)); + } +} + +std::pair<int32_t, int32_t> +OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) { + int32_t ThreadLimit = + Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit"); + + if (T.isAMDGPU()) { + const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size"); + if (!Attr.isValid() || !Attr.isStringAttribute()) + return {0, ThreadLimit}; + auto [LBStr, UBStr] = Attr.getValueAsString().split(','); + int32_t LB, UB; + if (!llvm::to_integer(UBStr, UB, 10)) + return {0, ThreadLimit}; + UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB; + if (!llvm::to_integer(LBStr, LB, 10)) + return {0, UB}; + return {LB, UB}; + } + + if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) { + auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); + int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); + return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB}; + } + return {0, ThreadLimit}; +} + +void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T, + Function &Kernel, int32_t LB, + int32_t UB) { + Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB)); + + if (T.isAMDGPU()) { + Kernel.addFnAttr("amdgpu-flat-work-group-size", + llvm::utostr(LB) + "," + llvm::utostr(UB)); + return; + } + + updateNVPTXMetadata(Kernel, "maxntidx", UB, true); +} + +std::pair<int32_t, int32_t> +OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) { + // TODO: Read from backend annotations if available. + return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")}; +} + +void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel, + int32_t LB, int32_t UB) { + if (T.isNVPTX()) { + if (UB > 0) + updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true); + updateNVPTXMetadata(Kernel, "minctasm", LB, false); + } + Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB)); } void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( - Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) { + Function *OutlinedFn) { if (Config.isTargetDevice()) { OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage); // TODO: Determine if DSO local can be set to true. OutlinedFn->setDSOLocal(false); OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility); - if (Triple(M.getTargetTriple()).isAMDGCN()) + if (T.isAMDGCN()) OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); } - - if (NumTeams > 0) - OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams)); - if (NumThreads > 0) - OutlinedFn->addFnAttr("omp_target_thread_limit", - std::to_string(NumThreads)); } Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn, StringRef EntryFnIDName) { if (Config.isTargetDevice()) { assert(OutlinedFn && "The outlined function must exist if embedded"); - return ConstantExpr::getBitCast(OutlinedFn, Builder.getInt8PtrTy()); + return OutlinedFn; } return new GlobalVariable( @@ -4118,9 +4740,8 @@ Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn, void OpenMPIRBuilder::emitTargetRegionFunction( TargetRegionEntryInfo &EntryInfo, - FunctionGenCallback &GenerateFunctionCallback, int32_t NumTeams, - int32_t NumThreads, bool IsOffloadEntry, Function *&OutlinedFn, - Constant *&OutlinedFnID) { + FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, + Function *&OutlinedFn, Constant *&OutlinedFnID) { SmallString<64> EntryFnName; OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo); @@ -4140,16 +4761,15 @@ void OpenMPIRBuilder::emitTargetRegionFunction( ? std::string(EntryFnName) : createPlatformSpecificName({EntryFnName, "region_id"}); - OutlinedFnID = registerTargetRegionFunction( - EntryInfo, OutlinedFn, EntryFnName, EntryFnIDName, NumTeams, NumThreads); + OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn, + EntryFnName, EntryFnIDName); } Constant *OpenMPIRBuilder::registerTargetRegionFunction( TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn, - StringRef EntryFnName, StringRef EntryFnIDName, int32_t NumTeams, - int32_t NumThreads) { + StringRef EntryFnName, StringRef EntryFnIDName) { if (OutlinedFn) - setOutlinedTargetRegionFunctionAttributes(OutlinedFn, NumTeams, NumThreads); + setOutlinedTargetRegionFunctionAttributes(OutlinedFn); auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName); auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName); OffloadInfoManager.registerTargetRegionEntryInfo( @@ -4161,8 +4781,7 @@ Constant *OpenMPIRBuilder::registerTargetRegionFunction( OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, - TargetDataInfo &Info, - function_ref<MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCB, + TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc, function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB, @@ -4171,6 +4790,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( if (!updateToLocation(Loc)) return InsertPointTy(); + // Disable TargetData CodeGen on Device pass. + if (Config.IsTargetDevice.value_or(false)) + return Builder.saveIP(); + Builder.restoreIP(CodeGenIP); bool IsStandAlone = !BodyGenCB; MapInfosTy *MapInfo; @@ -4293,13 +4916,104 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( return Builder.saveIP(); } -static Function * -createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - StringRef FuncName, SmallVectorImpl<Value *> &Inputs, - OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc) { +FunctionCallee +OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned, + bool IsGPUDistribute) { + assert((IVSize == 32 || IVSize == 64) && + "IV size is not compatible with the omp runtime"); + RuntimeFunction Name; + if (IsGPUDistribute) + Name = IVSize == 32 + ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4 + : omp::OMPRTL___kmpc_distribute_static_init_4u) + : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8 + : omp::OMPRTL___kmpc_distribute_static_init_8u); + else + Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4 + : omp::OMPRTL___kmpc_for_static_init_4u) + : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8 + : omp::OMPRTL___kmpc_for_static_init_8u); + + return getOrCreateRuntimeFunction(M, Name); +} + +FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize, + bool IVSigned) { + assert((IVSize == 32 || IVSize == 64) && + "IV size is not compatible with the omp runtime"); + RuntimeFunction Name = IVSize == 32 + ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4 + : omp::OMPRTL___kmpc_dispatch_init_4u) + : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8 + : omp::OMPRTL___kmpc_dispatch_init_8u); + + return getOrCreateRuntimeFunction(M, Name); +} + +FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize, + bool IVSigned) { + assert((IVSize == 32 || IVSize == 64) && + "IV size is not compatible with the omp runtime"); + RuntimeFunction Name = IVSize == 32 + ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4 + : omp::OMPRTL___kmpc_dispatch_next_4u) + : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8 + : omp::OMPRTL___kmpc_dispatch_next_8u); + + return getOrCreateRuntimeFunction(M, Name); +} + +FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize, + bool IVSigned) { + assert((IVSize == 32 || IVSize == 64) && + "IV size is not compatible with the omp runtime"); + RuntimeFunction Name = IVSize == 32 + ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4 + : omp::OMPRTL___kmpc_dispatch_fini_4u) + : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8 + : omp::OMPRTL___kmpc_dispatch_fini_8u); + + return getOrCreateRuntimeFunction(M, Name); +} + +static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr, + Function *Func) { + for (User *User : make_early_inc_range(ConstExpr->users())) + if (auto *Instr = dyn_cast<Instruction>(User)) + if (Instr->getFunction() == Func) + Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr)); +} + +static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input, + Function *Func) { + for (User *User : make_early_inc_range(Input->users())) + if (auto *Const = dyn_cast<Constant>(User)) + if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const)) + replaceConstatExprUsesInFuncWithInstr(ConstExpr, Func); +} + +static Function *createOutlinedFunction( + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, + SmallVectorImpl<Value *> &Inputs, + OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, + OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) { SmallVector<Type *> ParameterTypes; - for (auto &Arg : Inputs) - ParameterTypes.push_back(Arg->getType()); + if (OMPBuilder.Config.isTargetDevice()) { + // Add the "implicit" runtime argument we use to provide launch specific + // information for target devices. + auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext()); + ParameterTypes.push_back(Int8PtrTy); + + // All parameters to target devices are passed as pointers + // or i64. This assumes 64-bit address spaces/pointers. + for (auto &Arg : Inputs) + ParameterTypes.push_back(Arg->getType()->isPointerTy() + ? Arg->getType() + : Type::getInt64Ty(Builder.getContext())); + } else { + for (auto &Arg : Inputs) + ParameterTypes.push_back(Arg->getType()); + } auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes, /*isVarArg*/ false); @@ -4317,25 +5031,56 @@ createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, if (OMPBuilder.Config.isTargetDevice()) Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false)); - Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP())); + BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock(); // Insert target deinit call in the device compilation pass. + Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP())); if (OMPBuilder.Config.isTargetDevice()) - OMPBuilder.createTargetDeinit(Builder, /*IsSPMD*/ false); + OMPBuilder.createTargetDeinit(Builder); // Insert return instruction. Builder.CreateRetVoid(); + // New Alloca IP at entry point of created device function. + Builder.SetInsertPoint(EntryBB->getFirstNonPHI()); + auto AllocaIP = Builder.saveIP(); + + Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg()); + + // Skip the artificial dyn_ptr on the device. + const auto &ArgRange = + OMPBuilder.Config.isTargetDevice() + ? make_range(Func->arg_begin() + 1, Func->arg_end()) + : Func->args(); + // Rewrite uses of input valus to parameters. - for (auto InArg : zip(Inputs, Func->args())) { + for (auto InArg : zip(Inputs, ArgRange)) { Value *Input = std::get<0>(InArg); Argument &Arg = std::get<1>(InArg); + Value *InputCopy = nullptr; + + Builder.restoreIP( + ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP())); + + // Things like GEP's can come in the form of Constants. Constants and + // ConstantExpr's do not have access to the knowledge of what they're + // contained in, so we must dig a little to find an instruction so we can + // tell if they're used inside of the function we're outlining. We also + // replace the original constant expression with a new instruction + // equivalent; an instruction as it allows easy modification in the + // following loop, as we can now know the constant (instruction) is owned by + // our target function and replaceUsesOfWith can now be invoked on it + // (cannot do this with constants it seems). A brand new one also allows us + // to be cautious as it is perhaps possible the old expression was used + // inside of the function but exists and is used externally (unlikely by the + // nature of a Constant, but still). + replaceConstantValueUsesInFuncWithInstr(Input, Func); // Collect all the instructions for (User *User : make_early_inc_range(Input->users())) - if (auto Instr = dyn_cast<Instruction>(User)) + if (auto *Instr = dyn_cast<Instruction>(User)) if (Instr->getFunction() == Func) - Instr->replaceUsesOfWith(Input, &Arg); + Instr->replaceUsesOfWith(Input, InputCopy); } // Restore insert point. @@ -4344,45 +5089,96 @@ createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, return Func; } -static void -emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - TargetRegionEntryInfo &EntryInfo, - Function *&OutlinedFn, int32_t NumTeams, - int32_t NumThreads, SmallVectorImpl<Value *> &Inputs, - OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc) { +static void emitTargetOutlinedFunction( + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, + TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, + Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs, + OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, + OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) { OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction = - [&OMPBuilder, &Builder, &Inputs, &CBFunc](StringRef EntryFnName) { + [&OMPBuilder, &Builder, &Inputs, &CBFunc, + &ArgAccessorFuncCB](StringRef EntryFnName) { return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs, - CBFunc); + CBFunc, ArgAccessorFuncCB); }; - Constant *OutlinedFnID; - OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, - NumTeams, NumThreads, true, OutlinedFn, - OutlinedFnID); + OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true, + OutlinedFn, OutlinedFnID); } -static void emitTargetCall(IRBuilderBase &Builder, Function *OutlinedFn, - SmallVectorImpl<Value *> &Args) { - // TODO: Add kernel launch call - Builder.CreateCall(OutlinedFn, Args); +static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, + OpenMPIRBuilder::InsertPointTy AllocaIP, + Function *OutlinedFn, Constant *OutlinedFnID, + int32_t NumTeams, int32_t NumThreads, + SmallVectorImpl<Value *> &Args, + OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) { + + OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); + + OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); + OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info, + /*IsNonContiguous=*/true); + + OpenMPIRBuilder::TargetDataRTArgs RTArgs; + OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info, + !MapInfo.Names.empty()); + + // emitKernelLaunch + auto &&EmitTargetCallFallbackCB = + [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy { + Builder.restoreIP(IP); + Builder.CreateCall(OutlinedFn, Args); + return Builder.saveIP(); + }; + + unsigned NumTargetItems = MapInfo.BasePointers.size(); + // TODO: Use correct device ID + Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF); + Value *NumTeamsVal = Builder.getInt32(NumTeams); + Value *NumThreadsVal = Builder.getInt32(NumThreads); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, + llvm::omp::IdentFlag(0), 0); + // TODO: Use correct NumIterations + Value *NumIterations = Builder.getInt64(0); + // TODO: Use correct DynCGGroupMem + Value *DynCGGroupMem = Builder.getInt32(0); + + bool HasNoWait = false; + + OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations, + NumTeamsVal, NumThreadsVal, + DynCGGroupMem, HasNoWait); + + Builder.restoreIP(OMPBuilder.emitKernelLaunch( + Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, + DeviceID, RTLoc, AllocaIP)); } OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget( - const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy CodeGenIP, - TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, - SmallVectorImpl<Value *> &Args, TargetBodyGenCallbackTy CBFunc) { + const LocationDescription &Loc, InsertPointTy AllocaIP, + InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, + int32_t NumThreads, SmallVectorImpl<Value *> &Args, + GenMapInfoCallbackTy GenMapInfoCB, + OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, + OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) { if (!updateToLocation(Loc)) return InsertPointTy(); Builder.restoreIP(CodeGenIP); Function *OutlinedFn; - emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn, NumTeams, - NumThreads, Args, CBFunc); + Constant *OutlinedFnID; + emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn, + OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB); if (!Config.isTargetDevice()) - emitTargetCall(Builder, OutlinedFn, Args); + emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams, + NumThreads, Args, GenMapInfoCB); + return Builder.saveIP(); } @@ -4417,11 +5213,17 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, // variable for possibly changing that to internal or private, or maybe // create different versions of the function for different OMP internal // variables. - auto *GV = new GlobalVariable( - M, Ty, /*IsConstant=*/false, GlobalValue::CommonLinkage, - Constant::getNullValue(Ty), Elem.first(), - /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AddressSpace); - GV->setAlignment(M.getDataLayout().getABITypeAlign(Ty)); + auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0 + ? GlobalValue::ExternalLinkage + : GlobalValue::CommonLinkage; + auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, + Constant::getNullValue(Ty), Elem.first(), + /*InsertBefore=*/nullptr, + GlobalValue::NotThreadLocal, AddressSpace); + const DataLayout &DL = M.getDataLayout(); + const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); + const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); + GV->setAlignment(std::max(TypeAlign, PtrAlign)); Elem.second = GV; } @@ -4513,10 +5315,11 @@ void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder, bool ForEndCall) { assert((!ForEndCall || Info.separateBeginEndCalls()) && "expected region end call to runtime only when end call is separate"); - auto VoidPtrTy = Type::getInt8PtrTy(M.getContext()); - auto VoidPtrPtrTy = VoidPtrTy->getPointerTo(0); + auto UnqualPtrTy = PointerType::getUnqual(M.getContext()); + auto VoidPtrTy = UnqualPtrTy; + auto VoidPtrPtrTy = UnqualPtrTy; auto Int64Ty = Type::getInt64Ty(M.getContext()); - auto Int64PtrTy = Type::getInt64PtrTy(M.getContext()); + auto Int64PtrTy = UnqualPtrTy; if (!Info.NumberOfPtrs) { RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy); @@ -4622,12 +5425,12 @@ void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP, // args[I] = &dims Builder.restoreIP(CodeGenIP); Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast( - DimsAddr, Builder.getInt8PtrTy()); + DimsAddr, Builder.getPtrTy()); Value *P = Builder.CreateConstInBoundsGEP2_32( - ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs), + ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0, I); Builder.CreateAlignedStore( - DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getInt8PtrTy())); + DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy())); ++L; } } @@ -4649,7 +5452,7 @@ void OpenMPIRBuilder::emitOffloadingArrays( // Detect if we have any capture size requiring runtime evaluation of the // size so that a constant array could be eventually used. ArrayType *PointerArrayType = - ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs); + ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs); Info.RTArgs.BasePointersArray = Builder.CreateAlloca( PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs"); @@ -4665,7 +5468,7 @@ void OpenMPIRBuilder::emitOffloadingArrays( // need to fill up the arrays as we do for the pointers. Type *Int64Ty = Builder.getInt64Ty(); SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(), - ConstantInt::get(Builder.getInt64Ty(), 0)); + ConstantInt::get(Int64Ty, 0)); SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size()); for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) { if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) { @@ -4674,8 +5477,8 @@ void OpenMPIRBuilder::emitOffloadingArrays( static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( CombinedInfo.Types[I] & OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG)) - ConstSizes[I] = ConstantInt::get(Builder.getInt64Ty(), - CombinedInfo.NonContigInfo.Dims[I]); + ConstSizes[I] = + ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]); else ConstSizes[I] = CI; continue; @@ -4708,11 +5511,9 @@ void OpenMPIRBuilder::emitOffloadingArrays( SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); Buffer->setAlignment(OffloadSizeAlign); Builder.restoreIP(CodeGenIP); - Value *GblConstPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( - SizesArrayGbl, Int64Ty->getPointerTo()); Builder.CreateMemCpy( Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()), - GblConstPtr, OffloadSizeAlign, + SizesArrayGbl, OffloadSizeAlign, Builder.getIntN( IndexSize, Buffer->getAllocationSize(M.getDataLayout())->getFixedValue())); @@ -4740,8 +5541,8 @@ void OpenMPIRBuilder::emitOffloadingArrays( createOffloadMapnames(CombinedInfo.Names, MapnamesName); Info.RTArgs.MapNamesArray = MapNamesArrayGbl; } else { - Info.RTArgs.MapNamesArray = Constant::getNullValue( - Type::getInt8Ty(Builder.getContext())->getPointerTo()); + Info.RTArgs.MapNamesArray = + Constant::getNullValue(PointerType::getUnqual(Builder.getContext())); } // If there's a present map type modifier, it must not be applied to the end @@ -4762,60 +5563,54 @@ void OpenMPIRBuilder::emitOffloadingArrays( } } + PointerType *PtrTy = Builder.getPtrTy(); for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) { Value *BPVal = CombinedInfo.BasePointers[I]; Value *BP = Builder.CreateConstInBoundsGEP2_32( - ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs), - Info.RTArgs.BasePointersArray, 0, I); - BP = Builder.CreatePointerBitCastOrAddrSpaceCast( - BP, BPVal->getType()->getPointerTo(/*AddrSpace=*/0)); - Builder.CreateAlignedStore( - BPVal, BP, M.getDataLayout().getPrefTypeAlign(Builder.getInt8PtrTy())); + ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray, + 0, I); + Builder.CreateAlignedStore(BPVal, BP, + M.getDataLayout().getPrefTypeAlign(PtrTy)); if (Info.requiresDevicePointerInfo()) { if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) { CodeGenIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); - Info.DevicePtrInfoMap[BPVal] = { - BP, Builder.CreateAlloca(Builder.getPtrTy())}; + Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)}; Builder.restoreIP(CodeGenIP); - assert(DeviceAddrCB && - "DeviceAddrCB missing for DevicePtr code generation"); - DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second); + if (DeviceAddrCB) + DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second); } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) { Info.DevicePtrInfoMap[BPVal] = {BP, BP}; - assert(DeviceAddrCB && - "DeviceAddrCB missing for DevicePtr code generation"); - DeviceAddrCB(I, BP); + if (DeviceAddrCB) + DeviceAddrCB(I, BP); } } Value *PVal = CombinedInfo.Pointers[I]; Value *P = Builder.CreateConstInBoundsGEP2_32( - ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs), - Info.RTArgs.PointersArray, 0, I); - P = Builder.CreatePointerBitCastOrAddrSpaceCast( - P, PVal->getType()->getPointerTo(/*AddrSpace=*/0)); + ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0, + I); // TODO: Check alignment correct. - Builder.CreateAlignedStore( - PVal, P, M.getDataLayout().getPrefTypeAlign(Builder.getInt8PtrTy())); + Builder.CreateAlignedStore(PVal, P, + M.getDataLayout().getPrefTypeAlign(PtrTy)); if (RuntimeSizes.test(I)) { Value *S = Builder.CreateConstInBoundsGEP2_32( ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray, /*Idx0=*/0, /*Idx1=*/I); - Builder.CreateAlignedStore( - Builder.CreateIntCast(CombinedInfo.Sizes[I], Int64Ty, - /*isSigned=*/true), - S, M.getDataLayout().getPrefTypeAlign(Builder.getInt8PtrTy())); + Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I], + Int64Ty, + /*isSigned=*/true), + S, M.getDataLayout().getPrefTypeAlign(PtrTy)); } // Fill up the mapper array. unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); - Value *MFunc = ConstantPointerNull::get(Builder.getInt8PtrTy()); + Value *MFunc = ConstantPointerNull::get(PtrTy); if (CustomMapperCB) if (Value *CustomMFunc = CustomMapperCB(I)) - MFunc = Builder.CreatePointerCast(CustomMFunc, Builder.getInt8PtrTy()); + MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy); Value *MAddr = Builder.CreateInBoundsGEP( MappersArray->getAllocatedType(), MappersArray, {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)}); @@ -5007,8 +5802,8 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, if (!updateToLocation(Loc)) return Loc.IP; - Type *XTy = X.Var->getType(); - assert(XTy->isPointerTy() && "OMP Atomic expects a pointer to target memory"); + assert(X.Var->getType()->isPointerTy() && + "OMP Atomic expects a pointer to target memory"); Type *XElemTy = X.ElemTy; assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() || XElemTy->isPointerTy()) && @@ -5019,14 +5814,11 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, XSt->setAtomic(AO); } else { // We need to bitcast and perform atomic op as integers - unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace(); IntegerType *IntCastTy = IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits()); - Value *XBCast = Builder.CreateBitCast( - X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.dst.int.cast"); Value *ExprCast = Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast"); - StoreInst *XSt = Builder.CreateStore(ExprCast, XBCast, X.IsVolatile); + StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile); XSt->setAtomic(AO); } @@ -5406,12 +6198,152 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare( return Builder.saveIP(); } +OpenMPIRBuilder::InsertPointTy +OpenMPIRBuilder::createTeams(const LocationDescription &Loc, + BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower, + Value *NumTeamsUpper, Value *ThreadLimit, + Value *IfExpr) { + if (!updateToLocation(Loc)) + return InsertPointTy(); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Function *CurrentFunction = Builder.GetInsertBlock()->getParent(); + + // Outer allocation basicblock is the entry block of the current function. + BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock(); + if (&OuterAllocaBB == Builder.GetInsertBlock()) { + BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry"); + Builder.SetInsertPoint(BodyBB, BodyBB->begin()); + } + + // The current basic block is split into four basic blocks. After outlining, + // they will be mapped as follows: + // ``` + // def current_fn() { + // current_basic_block: + // br label %teams.exit + // teams.exit: + // ; instructions after teams + // } + // + // def outlined_fn() { + // teams.alloca: + // br label %teams.body + // teams.body: + // ; instructions within teams body + // } + // ``` + BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit"); + BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body"); + BasicBlock *AllocaBB = + splitBB(Builder, /*CreateBranch=*/true, "teams.alloca"); + + // Push num_teams + if (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr) { + assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) && + "if lowerbound is non-null, then upperbound must also be non-null " + "for bounds on num_teams"); + + if (NumTeamsUpper == nullptr) + NumTeamsUpper = Builder.getInt32(0); + + if (NumTeamsLower == nullptr) + NumTeamsLower = NumTeamsUpper; + + if (IfExpr) { + assert(IfExpr->getType()->isIntegerTy() && + "argument to if clause must be an integer value"); + + // upper = ifexpr ? upper : 1 + if (IfExpr->getType() != Int1) + IfExpr = Builder.CreateICmpNE(IfExpr, + ConstantInt::get(IfExpr->getType(), 0)); + NumTeamsUpper = Builder.CreateSelect( + IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper"); + + // lower = ifexpr ? lower : 1 + NumTeamsLower = Builder.CreateSelect( + IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower"); + } + + if (ThreadLimit == nullptr) + ThreadLimit = Builder.getInt32(0); + + Value *ThreadNum = getOrCreateThreadID(Ident); + Builder.CreateCall( + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51), + {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit}); + } + // Generate the body of teams. + InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); + InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); + BodyGenCB(AllocaIP, CodeGenIP); + + OutlineInfo OI; + OI.EntryBB = AllocaBB; + OI.ExitBB = ExitBB; + OI.OuterAllocaBB = &OuterAllocaBB; + + // Insert fake values for global tid and bound tid. + std::stack<Instruction *> ToBeDeleted; + InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin()); + OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true)); + OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true)); + + OI.PostOutlineCB = [this, Ident, ToBeDeleted](Function &OutlinedFn) mutable { + // The stale call instruction will be replaced with a new call instruction + // for runtime call with the outlined function. + + assert(OutlinedFn.getNumUses() == 1 && + "there must be a single user for the outlined function"); + CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back()); + ToBeDeleted.push(StaleCI); + + assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) && + "Outlined function must have two or three arguments only"); + + bool HasShared = OutlinedFn.arg_size() == 3; + + OutlinedFn.getArg(0)->setName("global.tid.ptr"); + OutlinedFn.getArg(1)->setName("bound.tid.ptr"); + if (HasShared) + OutlinedFn.getArg(2)->setName("data"); + + // Call to the runtime function for teams in the current function. + assert(StaleCI && "Error while outlining - no CallInst user found for the " + "outlined function."); + Builder.SetInsertPoint(StaleCI); + SmallVector<Value *> Args = { + Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn}; + if (HasShared) + Args.push_back(StaleCI->getArgOperand(2)); + Builder.CreateCall(getOrCreateRuntimeFunctionPtr( + omp::RuntimeFunction::OMPRTL___kmpc_fork_teams), + Args); + + while (!ToBeDeleted.empty()) { + ToBeDeleted.top()->eraseFromParent(); + ToBeDeleted.pop(); + } + }; + + addOutlineInfo(std::move(OI)); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + return Builder.saveIP(); +} + GlobalVariable * OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names, std::string VarName) { llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get( - llvm::ArrayType::get( - llvm::Type::getInt8Ty(M.getContext())->getPointerTo(), Names.size()), + llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()), + Names.size()), Names); auto *MapNamesArrayGlobal = new llvm::GlobalVariable( M, MapNamesArrayInit->getType(), @@ -5460,9 +6392,12 @@ void OpenMPIRBuilder::OutlineInfo::collectBlocks( void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, - GlobalValue::LinkageTypes) { + GlobalValue::LinkageTypes, + StringRef Name) { if (!Config.isGPU()) { - emitOffloadingEntry(ID, Addr->getName(), Size, Flags); + llvm::offloading::emitOffloadingEntry( + M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0, + "omp_offloading_entries"); return; } // TODO: Add support for global variables on the device after declare target @@ -5485,7 +6420,7 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, // Add a function attribute for the kernel. Fn->addFnAttr(Attribute::get(Ctx, "kernel")); - if (Triple(M.getTargetTriple()).isAMDGCN()) + if (T.isAMDGCN()) Fn->addFnAttr("uniform-work-group-size", "true"); Fn->addFnAttr(Attribute::MustProgress); } @@ -5622,13 +6557,20 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( // Hidden or internal symbols on the device are not externally visible. // We should not attempt to register them by creating an offloading - // entry. + // entry. Indirect variables are handled separately on the device. if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress())) - if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) && + Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect) continue; - createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(), - Flags, CE->getLinkage()); + // Indirect globals need to use a special name that doesn't match the name + // of the associated host global. + if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect) + createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(), + Flags, CE->getLinkage(), CE->getVarName()); + else + createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(), + Flags, CE->getLinkage()); } else { llvm_unreachable("Unsupported entry kind."); @@ -5670,6 +6612,42 @@ OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, std::get<1>(FileIDInfo)); } +unsigned OpenMPIRBuilder::getFlagMemberOffset() { + unsigned Offset = 0; + for (uint64_t Remain = + static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( + omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF); + !(Remain & 1); Remain = Remain >> 1) + Offset++; + return Offset; +} + +omp::OpenMPOffloadMappingFlags +OpenMPIRBuilder::getMemberOfFlag(unsigned Position) { + // Rotate by getFlagMemberOffset() bits. + return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1) + << getFlagMemberOffset()); +} + +void OpenMPIRBuilder::setCorrectMemberOfFlag( + omp::OpenMPOffloadMappingFlags &Flags, + omp::OpenMPOffloadMappingFlags MemberOfFlag) { + // If the entry is PTR_AND_OBJ but has not been marked with the special + // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be + // marked as MEMBER_OF. + if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( + Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) && + static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>( + (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) != + omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF)) + return; + + // Reset the placeholder value to prepare the flag for the assignment of the + // proper MEMBER_OF value. + Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF; + Flags |= MemberOfFlag; +} + Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar( OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, @@ -5853,6 +6831,63 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) { } } +void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) { + if (HostFilePath.empty()) + return; + + auto Buf = MemoryBuffer::getFile(HostFilePath); + if (std::error_code Err = Buf.getError()) { + report_fatal_error(("error opening host file from host file path inside of " + "OpenMPIRBuilder: " + + Err.message()) + .c_str()); + } + + LLVMContext Ctx; + auto M = expectedToErrorOrAndEmitErrors( + Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx)); + if (std::error_code Err = M.getError()) { + report_fatal_error( + ("error parsing host file inside of OpenMPIRBuilder: " + Err.message()) + .c_str()); + } + + loadOffloadInfoMetadata(*M.get()); +} + +Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) { + // Skip the creation of the registration function if this is device codegen + if (Config.isTargetDevice()) + return nullptr; + + Builder.ClearInsertionPoint(); + + // Create registration function prototype + auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {}); + auto *RegFn = Function::Create( + RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M); + RegFn->setSection(".text.startup"); + RegFn->addFnAttr(Attribute::NoInline); + RegFn->addFnAttr(Attribute::NoUnwind); + + // Create registration function body + auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn); + ConstantInt *FlagsVal = + ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags()); + Function *RTLRegFn = getOrCreateRuntimeFunctionPtr( + omp::RuntimeFunction::OMPRTL___tgt_register_requires); + + Builder.SetInsertPoint(BB); + Builder.CreateCall(RTLRegFn, {FlagsVal}); + Builder.CreateRetVoid(); + + return RegFn; +} + +//===----------------------------------------------------------------------===// +// OffloadEntriesInfoManager +//===----------------------------------------------------------------------===// + bool OffloadEntriesInfoManager::empty() const { return OffloadEntriesTargetRegion.empty() && OffloadEntriesDeviceGlobalVar.empty(); @@ -5973,8 +7008,13 @@ void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo( } return; } - OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum, - Addr, VarSize, Flags, Linkage); + if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect) + OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum, + Addr, VarSize, Flags, Linkage, + VarName.str()); + else + OffloadEntriesDeviceGlobalVar.try_emplace( + VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, ""); ++OffloadingEntriesNum; } } @@ -5986,6 +7026,10 @@ void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo( Action(E.getKey(), E.getValue()); } +//===----------------------------------------------------------------------===// +// CanonicalLoopInfo +//===----------------------------------------------------------------------===// + void CanonicalLoopInfo::collectControlBlocks( SmallVectorImpl<BasicBlock *> &BBs) { // We only count those BBs as control block for which we do not need to |
