| author | Dimitry Andric <dim@FreeBSD.org> | 2023-02-11 12:38:04 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-11 12:38:11 +0000 |
| commit | e3b557809604d036af6e00c60f012c2025b59a5e (patch) | |
| tree | 8a11ba2269a3b669601e2fd41145b174008f4da8 | |
| parent | 08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
| -rw-r--r-- | llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1621 |
1 file changed, 943 insertions(+), 678 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index ef2384faa273..bee154dab10f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" @@ -32,6 +33,7 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/Assumptions.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/GlobalValue.h" @@ -45,12 +47,13 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" #include <algorithm> +#include <optional> +#include <string> using namespace llvm; using namespace omp; @@ -71,6 +74,8 @@ static cl::opt<bool> cl::desc("Disable function internalization."), cl::Hidden, cl::init(false)); +static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values", + cl::init(false), cl::Hidden); static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden); static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", @@ -182,13 +187,13 @@ struct AAICVTracker; /// Attributor runs. struct OMPInformationCache : public InformationCache { OMPInformationCache(Module &M, AnalysisGetter &AG, - BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, + BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC, KernelSet &Kernels) - : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), + : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M), Kernels(Kernels) { OMPBuilder.initialize(); - initializeRuntimeFunctions(); + initializeRuntimeFunctions(M); initializeInternalControlVars(); } @@ -412,7 +417,7 @@ struct OMPInformationCache : public InformationCache { // TODO: We directly convert uses into proper calls and unknown uses. for (Use &U : RFI.Declaration->uses()) { if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { - if (ModuleSlice.count(UserI->getFunction())) { + if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) { RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); ++NumUses; } @@ -445,8 +450,7 @@ struct OMPInformationCache : public InformationCache { /// Helper to initialize all runtime function information for those defined /// in OpenMPKinds.def. - void initializeRuntimeFunctions() { - Module &M = *((*ModuleSlice.begin())->getParent()); + void initializeRuntimeFunctions(Module &M) { // Helper macros for handling __VA_ARGS__ in OMP_RTL #define OMP_TYPE(VarName, ...) \ @@ -499,6 +503,18 @@ struct OMPInformationCache : public InformationCache { } #include "llvm/Frontend/OpenMP/OMPKinds.def" + // Remove the `noinline` attribute from `__kmpc`, `ompx::` and `omp_` + // functions, except if `optnone` is present. 
+ if (isOpenMPDevice(M)) { + for (Function &F : M) { + for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"}) + if (F.hasFnAttribute(Attribute::NoInline) && + F.getName().startswith(Prefix) && + !F.hasFnAttribute(Attribute::OptimizeNone)) + F.removeFnAttr(Attribute::NoInline); + } + } + // TODO: We should attach the attributes defined in OMPKinds.def. } @@ -587,6 +603,9 @@ struct KernelInfoState : AbstractState { /// caller is __kmpc_parallel_51. BooleanStateWithSetVector<uint8_t> ParallelLevels; + /// Flag that indicates if the kernel has nested Parallelism + bool NestedParallelism = false; + /// Abstract State interface ///{ @@ -605,6 +624,7 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicatePessimisticFixpoint(...) ChangeStatus indicatePessimisticFixpoint() override { IsAtFixpoint = true; + ParallelLevels.indicatePessimisticFixpoint(); ReachingKernelEntries.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.indicatePessimisticFixpoint(); ReachedKnownParallelRegions.indicatePessimisticFixpoint(); @@ -615,6 +635,7 @@ struct KernelInfoState : AbstractState { /// See AbstractState::indicateOptimisticFixpoint(...) ChangeStatus indicateOptimisticFixpoint() override { IsAtFixpoint = true; + ParallelLevels.indicateOptimisticFixpoint(); ReachingKernelEntries.indicateOptimisticFixpoint(); SPMDCompatibilityTracker.indicateOptimisticFixpoint(); ReachedKnownParallelRegions.indicateOptimisticFixpoint(); @@ -635,6 +656,8 @@ struct KernelInfoState : AbstractState { return false; if (ReachingKernelEntries != RHS.ReachingKernelEntries) return false; + if (ParallelLevels != RHS.ParallelLevels) + return false; return true; } @@ -672,6 +695,7 @@ struct KernelInfoState : AbstractState { SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions; ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions; + NestedParallelism |= KIS.NestedParallelism; return *this; } @@ -806,8 +830,6 @@ struct OpenMPOpt { if (remarksEnabled()) analysisGlobalization(); - - Changed |= eliminateBarriers(); } else { if (PrintICVValues) printICVs(); @@ -830,8 +852,6 @@ struct OpenMPOpt { Changed = true; } } - - Changed |= eliminateBarriers(); } return Changed; @@ -843,7 +863,7 @@ struct OpenMPOpt { InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, ICV_proc_bind}; - for (Function *F : OMPInfoCache.ModuleSlice) { + for (Function *F : SCC) { for (auto ICV : ICVs) { auto ICVInfo = OMPInfoCache.ICVs[ICV]; auto Remark = [&](OptimizationRemarkAnalysis ORA) { @@ -1397,212 +1417,6 @@ private: return Changed; } - /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels. - /// TODO: Make this an AA and expand it to work across blocks and functions. 
- bool eliminateBarriers() { - bool Changed = false; - - if (DisableOpenMPOptBarrierElimination) - return /*Changed=*/false; - - if (OMPInfoCache.Kernels.empty()) - return /*Changed=*/false; - - enum ImplicitBarrierType { IBT_ENTRY, IBT_EXIT }; - - class BarrierInfo { - Instruction *I; - enum ImplicitBarrierType Type; - - public: - BarrierInfo(enum ImplicitBarrierType Type) : I(nullptr), Type(Type) {} - BarrierInfo(Instruction &I) : I(&I) {} - - bool isImplicit() { return !I; } - - bool isImplicitEntry() { return isImplicit() && Type == IBT_ENTRY; } - - bool isImplicitExit() { return isImplicit() && Type == IBT_EXIT; } - - Instruction *getInstruction() { return I; } - }; - - for (Function *Kernel : OMPInfoCache.Kernels) { - for (BasicBlock &BB : *Kernel) { - SmallVector<BarrierInfo, 8> BarriersInBlock; - SmallPtrSet<Instruction *, 8> BarriersToBeDeleted; - - // Add the kernel entry implicit barrier. - if (&Kernel->getEntryBlock() == &BB) - BarriersInBlock.push_back(IBT_ENTRY); - - // Find implicit and explicit aligned barriers in the same basic block. - for (Instruction &I : BB) { - if (isa<ReturnInst>(I)) { - // Add the implicit barrier when exiting the kernel. - BarriersInBlock.push_back(IBT_EXIT); - continue; - } - CallBase *CB = dyn_cast<CallBase>(&I); - if (!CB) - continue; - - auto IsAlignBarrierCB = [&](CallBase &CB) { - switch (CB.getIntrinsicID()) { - case Intrinsic::nvvm_barrier0: - case Intrinsic::nvvm_barrier0_and: - case Intrinsic::nvvm_barrier0_or: - case Intrinsic::nvvm_barrier0_popc: - return true; - default: - break; - } - return hasAssumption(CB, - KnownAssumptionString("ompx_aligned_barrier")); - }; - - if (IsAlignBarrierCB(*CB)) { - // Add an explicit aligned barrier. - BarriersInBlock.push_back(I); - } - } - - if (BarriersInBlock.size() <= 1) - continue; - - // A barrier in a barrier pair is removeable if all instructions - // between the barriers in the pair are side-effect free modulo the - // barrier operation. - auto IsBarrierRemoveable = [&Kernel](BarrierInfo *StartBI, - BarrierInfo *EndBI) { - assert( - !StartBI->isImplicitExit() && - "Expected start barrier to be other than a kernel exit barrier"); - assert( - !EndBI->isImplicitEntry() && - "Expected end barrier to be other than a kernel entry barrier"); - // If StarBI instructions is null then this the implicit - // kernel entry barrier, so iterate from the first instruction in the - // entry block. - Instruction *I = (StartBI->isImplicitEntry()) - ? &Kernel->getEntryBlock().front() - : StartBI->getInstruction()->getNextNode(); - assert(I && "Expected non-null start instruction"); - Instruction *E = (EndBI->isImplicitExit()) - ? I->getParent()->getTerminator() - : EndBI->getInstruction(); - assert(E && "Expected non-null end instruction"); - - for (; I != E; I = I->getNextNode()) { - if (!I->mayHaveSideEffects() && !I->mayReadFromMemory()) - continue; - - auto IsPotentiallyAffectedByBarrier = - [](Optional<MemoryLocation> Loc) { - const Value *Obj = (Loc && Loc->Ptr) - ? 
getUnderlyingObject(Loc->Ptr) - : nullptr; - if (!Obj) { - LLVM_DEBUG( - dbgs() - << "Access to unknown location requires barriers\n"); - return true; - } - if (isa<UndefValue>(Obj)) - return false; - if (isa<AllocaInst>(Obj)) - return false; - if (auto *GV = dyn_cast<GlobalVariable>(Obj)) { - if (GV->isConstant()) - return false; - if (GV->isThreadLocal()) - return false; - if (GV->getAddressSpace() == (int)AddressSpace::Local) - return false; - if (GV->getAddressSpace() == (int)AddressSpace::Constant) - return false; - } - LLVM_DEBUG(dbgs() << "Access to '" << *Obj - << "' requires barriers\n"); - return true; - }; - - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { - Optional<MemoryLocation> Loc = MemoryLocation::getForDest(MI); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) { - Optional<MemoryLocation> Loc = - MemoryLocation::getForSource(MTI); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - } - continue; - } - - if (auto *LI = dyn_cast<LoadInst>(I)) - if (LI->hasMetadata(LLVMContext::MD_invariant_load)) - continue; - - Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I); - if (IsPotentiallyAffectedByBarrier(Loc)) - return false; - } - - return true; - }; - - // Iterate barrier pairs and remove an explicit barrier if analysis - // deems it removeable. - for (auto *It = BarriersInBlock.begin(), - *End = BarriersInBlock.end() - 1; - It != End; ++It) { - - BarrierInfo *StartBI = It; - BarrierInfo *EndBI = (It + 1); - - // Cannot remove when both are implicit barriers, continue. - if (StartBI->isImplicit() && EndBI->isImplicit()) - continue; - - if (!IsBarrierRemoveable(StartBI, EndBI)) - continue; - - assert(!(StartBI->isImplicit() && EndBI->isImplicit()) && - "Expected at least one explicit barrier to remove."); - - // Remove an explicit barrier, check first, then second. - if (!StartBI->isImplicit()) { - LLVM_DEBUG(dbgs() << "Remove start barrier " - << *StartBI->getInstruction() << "\n"); - BarriersToBeDeleted.insert(StartBI->getInstruction()); - } else { - LLVM_DEBUG(dbgs() << "Remove end barrier " - << *EndBI->getInstruction() << "\n"); - BarriersToBeDeleted.insert(EndBI->getInstruction()); - } - } - - if (BarriersToBeDeleted.empty()) - continue; - - Changed = true; - for (Instruction *I : BarriersToBeDeleted) { - ++NumBarriersEliminated; - auto Remark = [&](OptimizationRemark OR) { - return OR << "Redundant barrier eliminated."; - }; - - if (EnableVerboseRemarks) - emitRemark<OptimizationRemark>(I, "OMP190", Remark); - I->eraseFromParent(); - } - } - } - - return Changed; - } - void analysisGlobalization() { auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; @@ -1743,10 +1557,14 @@ private: // function. Used for storing information of the async transfer, allowing to // wait on it later. 
auto &IRBuilder = OMPInfoCache.OMPBuilder; - auto *F = RuntimeCall.getCaller(); - Instruction *FirstInst = &(F->getEntryBlock().front()); - AllocaInst *Handle = new AllocaInst( - IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); + Function *F = RuntimeCall.getCaller(); + BasicBlock &Entry = F->getEntryBlock(); + IRBuilder.Builder.SetInsertPoint(&Entry, + Entry.getFirstNonPHIOrDbgOrAlloca()); + Value *Handle = IRBuilder.Builder.CreateAlloca( + IRBuilder.AsyncInfo, /*ArraySize=*/nullptr, "handle"); + Handle = + IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr); // Add "issue" runtime call declaration: // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, @@ -1995,7 +1813,7 @@ private: bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } /// Cache to remember the unique kernel for a function. - DenseMap<Function *, Optional<Kernel>> UniqueKernelMap; + DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap; /// Find the unique kernel that will execute \p F, if any. Kernel getUniqueKernelFor(Function &F); @@ -2055,30 +1873,6 @@ private: [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); }); } - /// RAII struct to temporarily change an RTL function's linkage to external. - /// This prevents it from being mistakenly removed by other optimizations. - struct ExternalizationRAII { - ExternalizationRAII(OMPInformationCache &OMPInfoCache, - RuntimeFunction RFKind) - : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) { - if (!Declaration) - return; - - LinkageType = Declaration->getLinkage(); - Declaration->setLinkage(GlobalValue::ExternalLinkage); - } - - ~ExternalizationRAII() { - if (!Declaration) - return; - - Declaration->setLinkage(LinkageType); - } - - Function *Declaration; - GlobalValue::LinkageTypes LinkageType; - }; - /// The underlying module. Module &M; @@ -2103,21 +1897,6 @@ private: if (SCC.empty()) return false; - // Temporarily make these function have external linkage so the Attributor - // doesn't remove them when we try to look them up later. - ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel); - ExternalizationRAII EndParallel(OMPInfoCache, - OMPRTL___kmpc_kernel_end_parallel); - ExternalizationRAII BarrierSPMD(OMPInfoCache, - OMPRTL___kmpc_barrier_simple_spmd); - ExternalizationRAII BarrierGeneric(OMPInfoCache, - OMPRTL___kmpc_barrier_simple_generic); - ExternalizationRAII ThreadId(OMPInfoCache, - OMPRTL___kmpc_get_hardware_thread_id_in_block); - ExternalizationRAII NumThreads( - OMPInfoCache, OMPRTL___kmpc_get_hardware_num_threads_in_block); - ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); - registerAAs(IsModulePass); ChangeStatus Changed = A.run(); @@ -2131,17 +1910,22 @@ private: void registerFoldRuntimeCall(RuntimeFunction RF); /// Populate the Attributor with abstract attribute opportunities in the - /// function. + /// functions. void registerAAs(bool IsModulePass); + +public: + /// Callback to register AAs for live functions, including internal functions + /// marked live during the traversal. + static void registerAAsForFunction(Attributor &A, const Function &F); }; Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { - if (!OMPInfoCache.ModuleSlice.count(&F)) + if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F)) return nullptr; // Use a scope to keep the lifetime of the CachedKernel short. 
{ - Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; + std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; if (CachedKernel) return *CachedKernel; @@ -2327,16 +2111,16 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A); /// Return the value with which \p I can be replaced for specific \p ICV. - virtual Optional<Value *> getReplacementValue(InternalControlVar ICV, - const Instruction *I, - Attributor &A) const { - return None; + virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV, + const Instruction *I, + Attributor &A) const { + return std::nullopt; } /// Return an assumed unique ICV value if a single candidate is found. If - /// there cannot be one, return a nullptr. If it is not clear yet, return the - /// Optional::NoneType. - virtual Optional<Value *> + /// there cannot be one, return a nullptr. If it is not clear yet, return + /// std::nullopt. + virtual std::optional<Value *> getUniqueReplacementValue(InternalControlVar ICV) const = 0; // Currently only nthreads is being tracked. @@ -2402,7 +2186,7 @@ struct AAICVTrackerFunction : public AAICVTracker { }; auto CallCheck = [&](Instruction &I) { - Optional<Value *> ReplVal = getValueForCall(A, I, ICV); + std::optional<Value *> ReplVal = getValueForCall(A, I, ICV); if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) HasChanged = ChangeStatus::CHANGED; @@ -2429,13 +2213,13 @@ struct AAICVTrackerFunction : public AAICVTracker { /// Helper to check if \p I is a call and get the value for it if it is /// unique. - Optional<Value *> getValueForCall(Attributor &A, const Instruction &I, - InternalControlVar &ICV) const { + std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I, + InternalControlVar &ICV) const { const auto *CB = dyn_cast<CallBase>(&I); if (!CB || CB->hasFnAttr("no_openmp") || CB->hasFnAttr("no_openmp_routines")) - return None; + return std::nullopt; auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter]; @@ -2446,7 +2230,7 @@ struct AAICVTrackerFunction : public AAICVTracker { if (CalledFunction == nullptr) return nullptr; if (CalledFunction == GetterRFI.Declaration) - return None; + return std::nullopt; if (CalledFunction == SetterRFI.Declaration) { if (ICVReplacementValuesMap[ICV].count(&I)) return ICVReplacementValuesMap[ICV].lookup(&I); @@ -2462,7 +2246,7 @@ struct AAICVTrackerFunction : public AAICVTracker { *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); if (ICVTrackingAA.isAssumedTracked()) { - Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); + std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I), OMPInfoCache))) return URV; @@ -2472,16 +2256,16 @@ struct AAICVTrackerFunction : public AAICVTracker { return nullptr; } - // We don't check unique value for a function, so return None. - Optional<Value *> + // We don't check unique value for a function, so return std::nullopt. + std::optional<Value *> getUniqueReplacementValue(InternalControlVar ICV) const override { - return None; + return std::nullopt; } /// Return the value with which \p I can be replaced for specific \p ICV. 
- Optional<Value *> getReplacementValue(InternalControlVar ICV, - const Instruction *I, - Attributor &A) const override { + std::optional<Value *> getReplacementValue(InternalControlVar ICV, + const Instruction *I, + Attributor &A) const override { const auto &ValuesMap = ICVReplacementValuesMap[ICV]; if (ValuesMap.count(I)) return ValuesMap.lookup(I); @@ -2490,7 +2274,7 @@ struct AAICVTrackerFunction : public AAICVTracker { SmallPtrSet<const Instruction *, 16> Visited; Worklist.push_back(I); - Optional<Value *> ReplVal; + std::optional<Value *> ReplVal; while (!Worklist.empty()) { const Instruction *CurrInst = Worklist.pop_back_val(); @@ -2503,7 +2287,7 @@ struct AAICVTrackerFunction : public AAICVTracker { // ICV. while ((CurrInst = CurrInst->getPrevNode())) { if (ValuesMap.count(CurrInst)) { - Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst); + std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst); // Unknown value, track new. if (!ReplVal) { ReplVal = NewReplVal; @@ -2518,7 +2302,7 @@ struct AAICVTrackerFunction : public AAICVTracker { break; } - Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV); + std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV); if (!NewReplVal) continue; @@ -2566,12 +2350,12 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { } // Map of ICV to their values at specific program point. - EnumeratedArray<Optional<Value *>, InternalControlVar, + EnumeratedArray<std::optional<Value *>, InternalControlVar, InternalControlVar::ICV___last> ICVReplacementValuesMap; /// Return the value with which \p I can be replaced for specific \p ICV. - Optional<Value *> + std::optional<Value *> getUniqueReplacementValue(InternalControlVar ICV) const override { return ICVReplacementValuesMap[ICV]; } @@ -2585,11 +2369,11 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { - Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; - Optional<Value *> UniqueICVValue; + std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; + std::optional<Value *> UniqueICVValue; auto CheckReturnInst = [&](Instruction &I) { - Optional<Value *> NewReplVal = + std::optional<Value *> NewReplVal = ICVTrackingAA.getReplacementValue(ICV, &I, A); // If we found a second ICV value there is no unique returned value. @@ -2660,7 +2444,7 @@ struct AAICVTrackerCallSite : AAICVTracker { void trackStatistics() const override {} InternalControlVar AssociatedICV; - Optional<Value *> ReplVal; + std::optional<Value *> ReplVal; ChangeStatus updateImpl(Attributor &A) override { const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( @@ -2670,7 +2454,7 @@ struct AAICVTrackerCallSite : AAICVTracker { if (!ICVTrackingAA.isAssumedTracked()) return indicatePessimisticFixpoint(); - Optional<Value *> NewReplVal = + std::optional<Value *> NewReplVal = ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); if (ReplVal == NewReplVal) @@ -2682,7 +2466,7 @@ struct AAICVTrackerCallSite : AAICVTracker { // Return the value with which associated value can be replaced for specific // \p ICV. - Optional<Value *> + std::optional<Value *> getUniqueReplacementValue(InternalControlVar ICV) const override { return ReplVal; } @@ -2706,13 +2490,13 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { } // Map of ICV to their values at specific program point. 
- EnumeratedArray<Optional<Value *>, InternalControlVar, + EnumeratedArray<std::optional<Value *>, InternalControlVar, InternalControlVar::ICV___last> ICVReplacementValuesMap; /// Return the value with which associated value can be replaced for specific /// \p ICV. - Optional<Value *> + std::optional<Value *> getUniqueReplacementValue(InternalControlVar ICV) const override { return ICVReplacementValuesMap[ICV]; } @@ -2728,8 +2512,8 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { - Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; - Optional<Value *> NewReplVal = + std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; + std::optional<Value *> NewReplVal = ICVTrackingAA.getUniqueReplacementValue(ICV); if (ReplVal == NewReplVal) @@ -2746,77 +2530,216 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) : AAExecutionDomain(IRP, A) {} + ~AAExecutionDomainFunction() { + delete RPOT; + } + + void initialize(Attributor &A) override { + if (getAnchorScope()->isDeclaration()) { + indicatePessimisticFixpoint(); + return; + } + RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope()); + } + const std::string getAsStr() const override { - return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) + - "/" + std::to_string(NumBBs) + " BBs thread 0 only."; + unsigned TotalBlocks = 0, InitialThreadBlocks = 0; + for (auto &It : BEDMap) { + TotalBlocks++; + InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly; + } + return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" + + std::to_string(TotalBlocks) + " executed by initial thread only"; } /// See AbstractAttribute::trackStatistics(). void trackStatistics() const override {} - void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - for (const auto &BB : *F) - SingleThreadedBBs.insert(&BB); - NumBBs = SingleThreadedBBs.size(); - } - ChangeStatus manifest(Attributor &A) override { LLVM_DEBUG({ - for (const BasicBlock *BB : SingleThreadedBBs) + for (const BasicBlock &BB : *getAnchorScope()) { + if (!isExecutedByInitialThreadOnly(BB)) + continue; dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " " - << BB->getName() << " is executed by a single thread.\n"; + << BB.getName() << " is executed by a single thread.\n"; + } }); - return ChangeStatus::UNCHANGED; - } - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + if (DisableOpenMPOptBarrierElimination) + return Changed; - /// Check if an instruction is executed by a single thread. - bool isExecutedByInitialThreadOnly(const Instruction &I) const override { - return isExecutedByInitialThreadOnly(*I.getParent()); + SmallPtrSet<CallBase *, 16> DeletedBarriers; + auto HandleAlignedBarrier = [&](CallBase *CB) { + const ExecutionDomainTy &ED = CEDMap[CB]; + if (!ED.IsReachedFromAlignedBarrierOnly || + ED.EncounteredNonLocalSideEffect) + return; + + // We can remove this barrier, if it is one, or all aligned barriers + // reaching the kernel end. In the latter case we can transitively work + // our way back until we find a barrier that guards a side-effect if we + // are dealing with the kernel end here. 
+ if (CB) { + DeletedBarriers.insert(CB); + A.deleteAfterManifest(*CB); + ++NumBarriersEliminated; + Changed = ChangeStatus::CHANGED; + } else if (!ED.AlignedBarriers.empty()) { + NumBarriersEliminated += ED.AlignedBarriers.size(); + Changed = ChangeStatus::CHANGED; + SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(), + ED.AlignedBarriers.end()); + SmallSetVector<CallBase *, 16> Visited; + while (!Worklist.empty()) { + CallBase *LastCB = Worklist.pop_back_val(); + if (!Visited.insert(LastCB)) + continue; + if (!DeletedBarriers.count(LastCB)) { + A.deleteAfterManifest(*LastCB); + continue; + } + // The final aligned barrier (LastCB) reaching the kernel end was + // removed already. This means we can go one step further and remove + // the barriers encoutered last before (LastCB). + const ExecutionDomainTy &LastED = CEDMap[LastCB]; + Worklist.append(LastED.AlignedBarriers.begin(), + LastED.AlignedBarriers.end()); + } + } + + // If we actually eliminated a barrier we need to eliminate the associated + // llvm.assumes as well to avoid creating UB. + if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty())) + for (auto *AssumeCB : ED.EncounteredAssumes) + A.deleteAfterManifest(*AssumeCB); + }; + + for (auto *CB : AlignedBarriers) + HandleAlignedBarrier(CB); + + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + // Handle the "kernel end barrier" for kernels too. + if (OMPInfoCache.Kernels.count(getAnchorScope())) + HandleAlignedBarrier(nullptr); + + return Changed; } + /// Merge barrier and assumption information from \p PredED into the successor + /// \p ED. + void + mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED, + const ExecutionDomainTy &PredED); + + /// Merge all information from \p PredED into the successor \p ED. If + /// \p InitialEdgeOnly is set, only the initial edge will enter the block + /// represented by \p ED from this predecessor. + void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, + const ExecutionDomainTy &PredED, + bool InitialEdgeOnly = false); + + /// Accumulate information for the entry block in \p EntryBBED. + void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED); + + /// See AbstractAttribute::updateImpl. + ChangeStatus updateImpl(Attributor &A) override; + + /// Query interface, see AAExecutionDomain + ///{ bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { - return isValidState() && SingleThreadedBBs.contains(&BB); + if (!isValidState()) + return false; + return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly; } - /// Set of basic blocks that are executed by a single thread. - SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs; + bool isExecutedInAlignedRegion(Attributor &A, + const Instruction &I) const override { + if (!isValidState() || isa<CallBase>(I)) + return false; - /// Total number of basic blocks in this function. - long unsigned NumBBs = 0; -}; + const Instruction *CurI; -ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { - Function *F = getAnchorScope(); - ReversePostOrderTraversal<Function *> RPOT(F); - auto NumSingleThreadedBBs = SingleThreadedBBs.size(); + // Check forward until a call or the block end is reached. 
+ CurI = &I; + do { + auto *CB = dyn_cast<CallBase>(CurI); + if (!CB) + continue; + const auto &It = CEDMap.find(CB); + if (It == CEDMap.end()) + continue; + if (!It->getSecond().IsReachedFromAlignedBarrierOnly) + return false; + } while ((CurI = CurI->getNextNonDebugInstruction())); - bool AllCallSitesKnown; - auto PredForCallSite = [&](AbstractCallSite ACS) { - const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>( - *this, IRPosition::function(*ACS.getInstruction()->getFunction()), - DepClassTy::REQUIRED); - return ACS.isDirectCall() && - ExecutionDomainAA.isExecutedByInitialThreadOnly( - *ACS.getInstruction()); - }; + if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly) + return false; - if (!A.checkForAllCallSites(PredForCallSite, *this, - /* RequiresAllCallSites */ true, - AllCallSitesKnown)) - SingleThreadedBBs.remove(&F->getEntryBlock()); + // Check backward until a call or the block beginning is reached. + CurI = &I; + do { + auto *CB = dyn_cast<CallBase>(CurI); + if (!CB) + continue; + const auto &It = CEDMap.find(CB); + if (It == CEDMap.end()) + continue; + if (!AA::isNoSyncInst(A, *CB, *this)) { + if (It->getSecond().IsReachedFromAlignedBarrierOnly) + break; + return false; + } - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + Function *Callee = CB->getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + return false; + const auto &EDAA = A.getAAFor<AAExecutionDomain>( + *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); + if (!EDAA.getState().isValidState()) + return false; + if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly) + return false; + break; + } while ((CurI = CurI->getPrevNonDebugInstruction())); + + if (!CurI && + !llvm::all_of( + predecessors(I.getParent()), [&](const BasicBlock *PredBB) { + return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly; + })) { + return false; + } + + // On neither traversal we found a anything but aligned barriers. + return true; + } + + ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return BEDMap.lookup(&BB); + } + ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return CEDMap.lookup(&CB); + } + ExecutionDomainTy getFunctionExecutionDomain() const override { + assert(isValidState() && + "No request should be made against an invalid state!"); + return BEDMap.lookup(nullptr); + } + ///} // Check if the edge into the successor block contains a condition that only // lets the main thread execute it. - auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { + static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge, + BasicBlock &SuccessorBB) { if (!Edge || !Edge->isConditional()) return false; - if (Edge->getSuccessor(0) != SuccessorBB) + if (Edge->getSuccessor(0) != &SuccessorBB) return false; auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition()); @@ -2830,6 +2753,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) if (C->isAllOnesValue()) { auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0)); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; CB = CB ? 
OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; if (!CB) return false; @@ -2853,30 +2778,335 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { return false; }; - // Merge all the predecessor states into the current basic block. A basic - // block is executed by a single thread if all of its predecessors are. - auto MergePredecessorStates = [&](BasicBlock *BB) { - if (pred_empty(BB)) - return SingleThreadedBBs.contains(BB); + /// Mapping containing information per block. + DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap; + DenseMap<const CallBase *, ExecutionDomainTy> CEDMap; + SmallSetVector<CallBase *, 16> AlignedBarriers; + + ReversePostOrderTraversal<Function *> *RPOT = nullptr; +}; + +void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( + Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) { + for (auto *EA : PredED.EncounteredAssumes) + ED.addAssumeInst(A, *EA); + + for (auto *AB : PredED.AlignedBarriers) + ED.addAlignedBarrier(A, *AB); +} - bool IsInitialThread = true; - for (BasicBlock *PredBB : predecessors(BB)) { - if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()), - BB)) - IsInitialThread &= SingleThreadedBBs.contains(PredBB); +void AAExecutionDomainFunction::mergeInPredecessor( + Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, + bool InitialEdgeOnly) { + ED.IsExecutedByInitialThreadOnly = + InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && + ED.IsExecutedByInitialThreadOnly); + + ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly && + PredED.IsReachedFromAlignedBarrierOnly; + ED.EncounteredNonLocalSideEffect = + ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect; + if (ED.IsReachedFromAlignedBarrierOnly) + mergeInPredecessorBarriersAndAssumptions(A, ED, PredED); + else + ED.clearAssumeInstAndAlignedBarriers(); +} + +void AAExecutionDomainFunction::handleEntryBB(Attributor &A, + ExecutionDomainTy &EntryBBED) { + SmallVector<ExecutionDomainTy> PredExecDomains; + auto PredForCallSite = [&](AbstractCallSite ACS) { + const auto &EDAA = A.getAAFor<AAExecutionDomain>( + *this, IRPosition::function(*ACS.getInstruction()->getFunction()), + DepClassTy::OPTIONAL); + if (!EDAA.getState().isValidState()) + return false; + PredExecDomains.emplace_back( + EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction()))); + return true; + }; + + bool AllCallSitesKnown; + if (A.checkForAllCallSites(PredForCallSite, *this, + /* RequiresAllCallSites */ true, + AllCallSitesKnown)) { + for (const auto &PredED : PredExecDomains) + mergeInPredecessor(A, EntryBBED, PredED); + + } else { + // We could not find all predecessors, so this is either a kernel or a + // function with external linkage (or with some other weird uses). 
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + if (OMPInfoCache.Kernels.count(getAnchorScope())) { + EntryBBED.IsExecutedByInitialThreadOnly = false; + EntryBBED.IsReachedFromAlignedBarrierOnly = true; + EntryBBED.EncounteredNonLocalSideEffect = false; + } else { + EntryBBED.IsExecutedByInitialThreadOnly = false; + EntryBBED.IsReachedFromAlignedBarrierOnly = false; + EntryBBED.EncounteredNonLocalSideEffect = true; } + } + + auto &FnED = BEDMap[nullptr]; + FnED.IsReachingAlignedBarrierOnly &= + EntryBBED.IsReachedFromAlignedBarrierOnly; +} - return IsInitialThread; +ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { + + bool Changed = false; + + // Helper to deal with an aligned barrier encountered during the forward + // traversal. \p CB is the aligned barrier, \p ED is the execution domain when + // it was encountered. + auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) { + if (CB) + Changed |= AlignedBarriers.insert(CB); + // First, update the barrier ED kept in the separate CEDMap. + auto &CallED = CEDMap[CB]; + mergeInPredecessor(A, CallED, ED); + // Next adjust the ED we use for the traversal. + ED.EncounteredNonLocalSideEffect = false; + ED.IsReachedFromAlignedBarrierOnly = true; + // Aligned barrier collection has to come last. + ED.clearAssumeInstAndAlignedBarriers(); + if (CB) + ED.addAlignedBarrier(A, *CB); + }; + + auto &LivenessAA = + A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL); + + // Set \p R to \V and report true if that changed \p R. + auto SetAndRecord = [&](bool &R, bool V) { + bool Eq = (R == V); + R = V; + return !Eq; }; - for (auto *BB : RPOT) { - if (!MergePredecessorStates(BB)) - SingleThreadedBBs.remove(BB); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + + Function *F = getAnchorScope(); + BasicBlock &EntryBB = F->getEntryBlock(); + bool IsKernel = OMPInfoCache.Kernels.count(F); + + SmallVector<Instruction *> SyncInstWorklist; + for (auto &RIt : *RPOT) { + BasicBlock &BB = *RIt; + + bool IsEntryBB = &BB == &EntryBB; + // TODO: We use local reasoning since we don't have a divergence analysis + // running as well. We could basically allow uniform branches here. + bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel; + ExecutionDomainTy ED; + // Propagate "incoming edges" into information about this block. + if (IsEntryBB) { + handleEntryBB(A, ED); + } else { + // For live non-entry blocks we only propagate + // information via live edges. + if (LivenessAA.isAssumedDead(&BB)) + continue; + + for (auto *PredBB : predecessors(&BB)) { + if (LivenessAA.isEdgeDead(PredBB, &BB)) + continue; + bool InitialEdgeOnly = isInitialThreadOnlyEdge( + A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB); + mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly); + } + } + + // Now we traverse the block, accumulate effects in ED and attach + // information to calls. + for (Instruction &I : BB) { + bool UsedAssumedInformation; + if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation, + /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL, + /* CheckForDeadStore */ true)) + continue; + + // Asummes and "assume-like" (dbg, lifetime, ...) are handled first, the + // former is collected the latter is ignored. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) { + ED.addAssumeInst(A, *AI); + continue; + } + // TODO: Should we also collect and delete lifetime markers? 
+ if (II->isAssumeLikeIntrinsic()) + continue; + } + + auto *CB = dyn_cast<CallBase>(&I); + bool IsNoSync = AA::isNoSyncInst(A, I, *this); + bool IsAlignedBarrier = + !IsNoSync && CB && + AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock); + + AlignedBarrierLastInBlock &= IsNoSync; + + // Next we check for calls. Aligned barriers are handled + // explicitly, everything else is kept for the backward traversal and will + // also affect our state. + if (CB) { + if (IsAlignedBarrier) { + HandleAlignedBarrier(CB, ED); + AlignedBarrierLastInBlock = true; + continue; + } + + // Check the pointer(s) of a memory intrinsic explicitly. + if (isa<MemIntrinsic>(&I)) { + if (!ED.EncounteredNonLocalSideEffect && + AA::isPotentiallyAffectedByBarrier(A, I, *this)) + ED.EncounteredNonLocalSideEffect = true; + if (!IsNoSync) { + ED.IsReachedFromAlignedBarrierOnly = false; + SyncInstWorklist.push_back(&I); + } + continue; + } + + // Record how we entered the call, then accumulate the effect of the + // call in ED for potential use by the callee. + auto &CallED = CEDMap[CB]; + mergeInPredecessor(A, CallED, ED); + + // If we have a sync-definition we can check if it starts/ends in an + // aligned barrier. If we are unsure we assume any sync breaks + // alignment. + Function *Callee = CB->getCalledFunction(); + if (!IsNoSync && Callee && !Callee->isDeclaration()) { + const auto &EDAA = A.getAAFor<AAExecutionDomain>( + *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); + if (EDAA.getState().isValidState()) { + const auto &CalleeED = EDAA.getFunctionExecutionDomain(); + ED.IsReachedFromAlignedBarrierOnly = + CalleeED.IsReachedFromAlignedBarrierOnly; + AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly; + if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly) + ED.EncounteredNonLocalSideEffect |= + CalleeED.EncounteredNonLocalSideEffect; + else + ED.EncounteredNonLocalSideEffect = + CalleeED.EncounteredNonLocalSideEffect; + if (!CalleeED.IsReachingAlignedBarrierOnly) + SyncInstWorklist.push_back(&I); + if (CalleeED.IsReachedFromAlignedBarrierOnly) + mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED); + continue; + } + } + ED.IsReachedFromAlignedBarrierOnly = + IsNoSync && ED.IsReachedFromAlignedBarrierOnly; + AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly; + ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory(); + if (!IsNoSync) + SyncInstWorklist.push_back(&I); + } + + if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) + continue; + + // If we have a callee we try to use fine-grained information to + // determine local side-effects. 
+ if (CB) { + const auto &MemAA = A.getAAFor<AAMemoryLocation>( + *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL); + + auto AccessPred = [&](const Instruction *I, const Value *Ptr, + AAMemoryLocation::AccessKind, + AAMemoryLocation::MemoryLocationsKind) { + return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I); + }; + if (MemAA.getState().isValidState() && + MemAA.checkForAllAccessesToMemoryKind( + AccessPred, AAMemoryLocation::ALL_LOCATIONS)) + continue; + } + + if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I)) + continue; + + if (auto *LI = dyn_cast<LoadInst>(&I)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) + continue; + + if (!ED.EncounteredNonLocalSideEffect && + AA::isPotentiallyAffectedByBarrier(A, I, *this)) + ED.EncounteredNonLocalSideEffect = true; + } + + if (!isa<UnreachableInst>(BB.getTerminator()) && + !BB.getTerminator()->getNumSuccessors()) { + + auto &FnED = BEDMap[nullptr]; + mergeInPredecessor(A, FnED, ED); + + if (IsKernel) + HandleAlignedBarrier(nullptr, ED); + } + + ExecutionDomainTy &StoredED = BEDMap[&BB]; + ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly; + + // Check if we computed anything different as part of the forward + // traversal. We do not take assumptions and aligned barriers into account + // as they do not influence the state we iterate. Backward traversal values + // are handled later on. + if (ED.IsExecutedByInitialThreadOnly != + StoredED.IsExecutedByInitialThreadOnly || + ED.IsReachedFromAlignedBarrierOnly != + StoredED.IsReachedFromAlignedBarrierOnly || + ED.EncounteredNonLocalSideEffect != + StoredED.EncounteredNonLocalSideEffect) + Changed = true; + + // Update the state with the new value. + StoredED = std::move(ED); } - return (NumSingleThreadedBBs == SingleThreadedBBs.size()) - ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; + // Propagate (non-aligned) sync instruction effects backwards until the + // entry is hit or an aligned barrier. + SmallSetVector<BasicBlock *, 16> Visited; + while (!SyncInstWorklist.empty()) { + Instruction *SyncInst = SyncInstWorklist.pop_back_val(); + Instruction *CurInst = SyncInst; + bool HitAlignedBarrier = false; + while ((CurInst = CurInst->getPrevNode())) { + auto *CB = dyn_cast<CallBase>(CurInst); + if (!CB) + continue; + auto &CallED = CEDMap[CB]; + if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + HitAlignedBarrier = AlignedBarriers.count(CB); + if (HitAlignedBarrier) + break; + } + if (HitAlignedBarrier) + continue; + BasicBlock *SyncBB = SyncInst->getParent(); + for (auto *PredBB : predecessors(SyncBB)) { + if (LivenessAA.isEdgeDead(PredBB, SyncBB)) + continue; + if (!Visited.insert(PredBB)) + continue; + SyncInstWorklist.push_back(PredBB->getTerminator()); + auto &PredED = BEDMap[PredBB]; + if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + } + if (SyncBB != &EntryBB) + continue; + auto &FnED = BEDMap[nullptr]; + if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false)) + Changed = true; + } + + return Changed ? 
ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } /// Try to replace memory allocation calls called by a single thread with a @@ -2955,12 +3185,18 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + if (!RFI.Declaration) + return; Attributor::SimplifictionCallbackTy SCB = [](const IRPosition &, const AbstractAttribute *, - bool &) -> Optional<Value *> { return nullptr; }; + bool &) -> std::optional<Value *> { return nullptr; }; + + Function *F = getAnchorScope(); for (User *U : RFI.Declaration->users()) if (CallBase *CB = dyn_cast<CallBase>(U)) { + if (CB->getFunction() != F) + continue; MallocCalls.insert(CB); A.registerSimplificationCallback(IRPosition::callsite_returned(*CB), SCB); @@ -3057,20 +3293,33 @@ struct AAHeapToSharedFunction : public AAHeapToShared { } ChangeStatus updateImpl(Attributor &A) override { + if (MallocCalls.empty()) + return indicatePessimisticFixpoint(); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + if (!RFI.Declaration) + return ChangeStatus::UNCHANGED; + Function *F = getAnchorScope(); auto NumMallocCalls = MallocCalls.size(); // Only consider malloc calls executed by a single thread with a constant. for (User *U : RFI.Declaration->users()) { - const auto &ED = A.getAAFor<AAExecutionDomain>( - *this, IRPosition::function(*F), DepClassTy::REQUIRED); - if (CallBase *CB = dyn_cast<CallBase>(U)) - if (!isa<ConstantInt>(CB->getArgOperand(0)) || - !ED.isExecutedByInitialThreadOnly(*CB)) + if (CallBase *CB = dyn_cast<CallBase>(U)) { + if (CB->getCaller() != F) + continue; + if (!MallocCalls.count(CB)) + continue; + if (!isa<ConstantInt>(CB->getArgOperand(0))) { + MallocCalls.remove(CB); + continue; + } + const auto &ED = A.getAAFor<AAExecutionDomain>( + *this, IRPosition::function(*F), DepClassTy::REQUIRED); + if (!ED.isExecutedByInitialThreadOnly(*CB)) MallocCalls.remove(CB); + } } findPotentialRemovedFreeCalls(A); @@ -3115,6 +3364,10 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { ", #Reaching Kernels: " + (ReachingKernelEntries.isValidState() ? std::to_string(ReachingKernelEntries.size()) + : "<invalid>") + + ", #ParLevels: " + + (ParallelLevels.isValidState() + ? std::to_string(ParallelLevels.size()) : "<invalid>"); } @@ -3202,7 +3455,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, - bool &UsedAssumedInformation) -> Optional<Value *> { + bool &UsedAssumedInformation) -> std::optional<Value *> { // IRP represents the "use generic state machine" argument of an // __kmpc_target_init call. We will answer this one with the internal // state. As long as we are not in an invalid state, we will create a @@ -3223,7 +3476,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Attributor::SimplifictionCallbackTy ModeSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, - bool &UsedAssumedInformation) -> Optional<Value *> { + bool &UsedAssumedInformation) -> std::optional<Value *> { // IRP represents the "SPMDCompatibilityTracker" argument of an // __kmpc_target_init or // __kmpc_target_deinit call. 
We will answer this one with the internal @@ -3244,32 +3497,9 @@ struct AAKernelInfoFunction : AAKernelInfo { return Val; }; - Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB = - [&](const IRPosition &IRP, const AbstractAttribute *AA, - bool &UsedAssumedInformation) -> Optional<Value *> { - // IRP represents the "RequiresFullRuntime" argument of an - // __kmpc_target_init or __kmpc_target_deinit call. We will answer this - // one with the internal state of the SPMDCompatibilityTracker, so if - // generic then true, if SPMD then false. - if (!SPMDCompatibilityTracker.isValidState()) - return nullptr; - if (!SPMDCompatibilityTracker.isAtFixpoint()) { - if (AA) - A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); - UsedAssumedInformation = true; - } else { - UsedAssumedInformation = false; - } - auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), - !SPMDCompatibilityTracker.isAssumed()); - return Val; - }; - constexpr const int InitModeArgNo = 1; constexpr const int DeinitModeArgNo = 1; constexpr const int InitUseStateMachineArgNo = 2; - constexpr const int InitRequiresFullRuntimeArgNo = 3; - constexpr const int DeinitRequiresFullRuntimeArgNo = 2; A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), StateMachineSimplifyCB); @@ -3279,14 +3509,6 @@ struct AAKernelInfoFunction : AAKernelInfo { A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo), ModeSimplifyCB); - A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, - InitRequiresFullRuntimeArgNo), - IsGenericModeSimplifyCB); - A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelDeinitCB, - DeinitRequiresFullRuntimeArgNo), - IsGenericModeSimplifyCB); // Check if we know we are in SPMD-mode already. ConstantInt *ModeArg = @@ -3296,6 +3518,84 @@ struct AAKernelInfoFunction : AAKernelInfo { // This is a generic region but SPMDization is disabled so stop tracking. else if (DisableOpenMPOptSPMDization) SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + + // Register virtual uses of functions we might need to preserve. + auto RegisterVirtualUse = [&](RuntimeFunction RFKind, + Attributor::VirtualUseCallbackTy &CB) { + if (!OMPInfoCache.RFIs[RFKind].Declaration) + return; + A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB); + }; + + // Add a dependence to ensure updates if the state changes. + auto AddDependence = [](Attributor &A, const AAKernelInfo *KI, + const AbstractAttribute *QueryingAA) { + if (QueryingAA) { + A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL); + } + return true; + }; + + Attributor::VirtualUseCallbackTy CustomStateMachineUseCB = + [&](Attributor &A, const AbstractAttribute *QueryingAA) { + // Whenever we create a custom state machine we will insert calls to + // __kmpc_get_hardware_num_threads_in_block, + // __kmpc_get_warp_size, + // __kmpc_barrier_simple_generic, + // __kmpc_kernel_parallel, and + // __kmpc_kernel_end_parallel. + // Not needed if we are on track for SPMDzation. + if (SPMDCompatibilityTracker.isValidState()) + return AddDependence(A, this, QueryingAA); + // Not needed if we can't rewrite due to an invalid state. + if (!ReachedKnownParallelRegions.isValidState()) + return AddDependence(A, this, QueryingAA); + return false; + }; + + // Not needed if we are pre-runtime merge. 
+ if (!KernelInitCB->getCalledFunction()->isDeclaration()) { + RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block, + CustomStateMachineUseCB); + RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB); + RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic, + CustomStateMachineUseCB); + RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel, + CustomStateMachineUseCB); + RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel, + CustomStateMachineUseCB); + } + + // If we do not perform SPMDzation we do not need the virtual uses below. + if (SPMDCompatibilityTracker.isAtFixpoint()) + return; + + Attributor::VirtualUseCallbackTy HWThreadIdUseCB = + [&](Attributor &A, const AbstractAttribute *QueryingAA) { + // Whenever we perform SPMDzation we will insert + // __kmpc_get_hardware_thread_id_in_block calls. + if (!SPMDCompatibilityTracker.isValidState()) + return AddDependence(A, this, QueryingAA); + return false; + }; + RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block, + HWThreadIdUseCB); + + Attributor::VirtualUseCallbackTy SPMDBarrierUseCB = + [&](Attributor &A, const AbstractAttribute *QueryingAA) { + // Whenever we perform SPMDzation with guarding we will insert + // __kmpc_simple_barrier_spmd calls. If SPMDzation failed, there is + // nothing to guard, or there are no parallel regions, we don't need + // the calls. + if (!SPMDCompatibilityTracker.isValidState()) + return AddDependence(A, this, QueryingAA); + if (SPMDCompatibilityTracker.empty()) + return AddDependence(A, this, QueryingAA); + if (!mayContainParallelRegion()) + return AddDependence(A, this, QueryingAA); + return false; + }; + RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB); } /// Sanitize the string \p S such that it is a suitable global symbol name. @@ -3318,77 +3618,29 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!KernelInitCB || !KernelDeinitCB) return ChangeStatus::UNCHANGED; + /// Insert nested Parallelism global variable + Function *Kernel = getAnchorScope(); + Module &M = *Kernel->getParent(); + Type *Int8Ty = Type::getInt8Ty(M.getContext()); + new GlobalVariable(M, Int8Ty, /* isConstant */ true, + GlobalValue::WeakAnyLinkage, + ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), + Kernel->getName() + "_nested_parallelism"); + // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. ChangeStatus Changed = ChangeStatus::UNCHANGED; - if (!changeToSPMDMode(A, Changed)) - return buildCustomStateMachine(A); + if (!changeToSPMDMode(A, Changed)) { + if (!KernelInitCB->getCalledFunction()->isDeclaration()) + return buildCustomStateMachine(A); + } return Changed; } - bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { - if (!mayContainParallelRegion()) - return false; - + void insertInstructionGuardsHelper(Attributor &A) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - if (!SPMDCompatibilityTracker.isAssumed()) { - for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) { - if (!NonCompatibleI) - continue; - - // Skip diagnostics on calls to known OpenMP runtime functions for now. - if (auto *CB = dyn_cast<CallBase>(NonCompatibleI)) - if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction())) - continue; - - auto Remark = [&](OptimizationRemarkAnalysis ORA) { - ORA << "Value has potential side effects preventing SPMD-mode " - "execution"; - if (isa<CallBase>(NonCompatibleI)) { - ORA << ". 
Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to " - "the called function to override"; - } - return ORA << "."; - }; - A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121", - Remark); - - LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: " - << *NonCompatibleI << "\n"); - } - - return false; - } - - // Get the actual kernel, could be the caller of the anchor scope if we have - // a debug wrapper. - Function *Kernel = getAnchorScope(); - if (Kernel->hasLocalLinkage()) { - assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper."); - auto *CB = cast<CallBase>(Kernel->user_back()); - Kernel = CB->getCaller(); - } - assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); - - // Check if the kernel is already in SPMD mode, if so, return success. - GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( - (Kernel->getName() + "_exec_mode").str()); - assert(ExecMode && "Kernel without exec mode?"); - assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!"); - - // Set the global exec mode flag to indicate SPMD-Generic mode. - assert(isa<ConstantInt>(ExecMode->getInitializer()) && - "ExecMode is not an integer!"); - const int8_t ExecModeVal = - cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue(); - if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC) - return true; - - // We will now unconditionally modify the IR, indicate a change. - Changed = ChangeStatus::CHANGED; - auto CreateGuardedRegion = [&](Instruction *RegionStartI, Instruction *RegionEndI) { LoopInfo *LI = nullptr; @@ -3605,6 +3857,125 @@ struct AAKernelInfoFunction : AAKernelInfo { for (auto &GR : GuardedRegions) CreateGuardedRegion(GR.first, GR.second); + } + + void forceSingleThreadPerWorkgroupHelper(Attributor &A) { + // Only allow 1 thread per workgroup to continue executing the user code. + // + // InitCB = __kmpc_target_init(...) + // ThreadIdInBlock = __kmpc_get_hardware_thread_id_in_block(); + // if (ThreadIdInBlock != 0) return; + // UserCode: + // // user code + // + auto &Ctx = getAnchorValue().getContext(); + Function *Kernel = getAssociatedFunction(); + assert(Kernel && "Expected an associated function!"); + + // Create block for user code to branch to from initial block. + BasicBlock *InitBB = KernelInitCB->getParent(); + BasicBlock *UserCodeBB = InitBB->splitBasicBlock( + KernelInitCB->getNextNode(), "main.thread.user_code"); + BasicBlock *ReturnBB = + BasicBlock::Create(Ctx, "exit.threads", Kernel, UserCodeBB); + + // Register blocks with attributor: + A.registerManifestAddedBasicBlock(*InitBB); + A.registerManifestAddedBasicBlock(*UserCodeBB); + A.registerManifestAddedBasicBlock(*ReturnBB); + + // Debug location: + const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); + ReturnInst::Create(Ctx, ReturnBB)->setDebugLoc(DLoc); + InitBB->getTerminator()->eraseFromParent(); + + // Prepare call to OMPRTL___kmpc_get_hardware_thread_id_in_block. + Module &M = *Kernel->getParent(); + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + FunctionCallee ThreadIdInBlockFn = + OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_hardware_thread_id_in_block); + + // Get thread ID in block. 
+    CallInst *ThreadIdInBlock =
+        CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
+    OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
+    ThreadIdInBlock->setDebugLoc(DLoc);
+
+    // Eliminate all threads in the block with ID not equal to 0:
+    Instruction *IsMainThread =
+        ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
+                         ConstantInt::get(ThreadIdInBlock->getType(), 0),
+                         "thread.is_main", InitBB);
+    IsMainThread->setDebugLoc(DLoc);
+    BranchInst::Create(ReturnBB, UserCodeBB, IsMainThread, InitBB);
+  }
+
+  bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+    if (!SPMDCompatibilityTracker.isAssumed()) {
+      for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
+        if (!NonCompatibleI)
+          continue;
+
+        // Skip diagnostics on calls to known OpenMP runtime functions for now.
+        if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
+          if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
+            continue;
+
+        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
+          ORA << "Value has potential side effects preventing SPMD-mode "
+                 "execution";
+          if (isa<CallBase>(NonCompatibleI)) {
+            ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
+                   "the called function to override";
+          }
+          return ORA << ".";
        };
+        A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
+                                                 Remark);
+
+        LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
+                          << *NonCompatibleI << "\n");
+      }
+
+      return false;
+    }
+
+    // Get the actual kernel; it could be the caller of the anchor scope if we
+    // have a debug wrapper.
+    Function *Kernel = getAnchorScope();
+    if (Kernel->hasLocalLinkage()) {
+      assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");
+      auto *CB = cast<CallBase>(Kernel->user_back());
+      Kernel = CB->getCaller();
+    }
+    assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
+
+    // Check if the kernel is already in SPMD mode; if so, return success.
+    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
+        (Kernel->getName() + "_exec_mode").str());
+    assert(ExecMode && "Kernel without exec mode?");
+    assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
+
+    // Set the global exec mode flag to indicate SPMD-Generic mode.
+    assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+           "ExecMode is not an integer!");
+    const int8_t ExecModeVal =
+        cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+    if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
+      return true;
+
+    // We will now unconditionally modify the IR; indicate a change.
+    Changed = ChangeStatus::CHANGED;
+
+    // Do not use instruction guards when no parallel region is present inside
+    // the target region.
+    if (mayContainParallelRegion())
+      insertInstructionGuardsHelper(A);
+    else
+      forceSingleThreadPerWorkgroupHelper(A);
 
     // Adjust the global exec mode flag that tells the runtime what mode this
     // kernel is executed in.
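As a reading aid for the hunks above: when SPMD conversion is not possible and a kernel has no parallel region, only one thread per workgroup is kept alive. Below is a minimal C++ sketch of the control flow that forceSingleThreadPerWorkgroupHelper materializes; the function name and body are hypothetical (the pass emits LLVM IR directly), and only the runtime call and the quoted block names come from the diff itself.

    // Declared only; the OpenMP device runtime provides the definition.
    extern "C" int __kmpc_get_hardware_thread_id_in_block();

    // Hypothetical kernel body after the rewrite.
    extern "C" void rewritten_kernel_body() {
      // "exit.threads": every thread with a nonzero ID returns immediately.
      if (__kmpc_get_hardware_thread_id_in_block() != 0)
        return;
      // "main.thread.user_code": only the main thread runs the user code.
    }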
@@ -3618,8 +3989,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
     const int InitModeArgNo = 1;
     const int DeinitModeArgNo = 1;
     const int InitUseStateMachineArgNo = 2;
-    const int InitRequiresFullRuntimeArgNo = 3;
-    const int DeinitRequiresFullRuntimeArgNo = 2;
 
     auto &Ctx = getAnchorValue().getContext();
     A.changeUseAfterManifest(
@@ -3633,12 +4002,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
         KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
         *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
                                 OMP_TGT_EXEC_MODE_SPMD));
-    A.changeUseAfterManifest(
-        KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
-        *ConstantInt::getBool(Ctx, false));
-    A.changeUseAfterManifest(
-        KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
-        *ConstantInt::getBool(Ctx, false));
 
     ++NumOpenMPTargetRegionKernelsSPMD;
 
@@ -3982,23 +4345,21 @@ struct AAKernelInfoFunction : AAKernelInfo {
       if (!I.mayWriteToMemory())
         return true;
       if (auto *SI = dyn_cast<StoreInst>(&I)) {
-        SmallVector<const Value *> Objects;
-        getUnderlyingObjects(SI->getPointerOperand(), Objects);
-        if (llvm::all_of(Objects,
-                         [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
-          return true;
-        // Check for AAHeapToStack moved objects which must not be guarded.
+        const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
+            *this, IRPosition::value(*SI->getPointerOperand()),
+            DepClassTy::OPTIONAL);
         auto &HS = A.getAAFor<AAHeapToStack>(
             *this, IRPosition::function(*I.getFunction()),
             DepClassTy::OPTIONAL);
-        if (llvm::all_of(Objects, [&HS](const Value *Obj) {
-              auto *CB = dyn_cast<CallBase>(Obj);
-              if (!CB)
-                return false;
-              return HS.isAssumedHeapToStack(*CB);
-            })) {
+        if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) {
+              if (AA::isAssumedThreadLocalObject(A, Obj, *this))
+                return true;
+              // Check for AAHeapToStack moved objects which must not be
+              // guarded.
+              auto *CB = dyn_cast<CallBase>(&Obj);
+              return CB && HS.isAssumedHeapToStack(*CB);
+            }))
           return true;
-        }
       }
 
       // Insert instruction that needs guarding.
@@ -4020,28 +4381,30 @@ struct AAKernelInfoFunction : AAKernelInfo {
       updateReachingKernelEntries(A, AllReachingKernelsKnown);
       UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
 
-      if (!ParallelLevels.isValidState())
-        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
-      else if (!ReachingKernelEntries.isValidState())
-        SPMDCompatibilityTracker.indicatePessimisticFixpoint();
-      else if (!SPMDCompatibilityTracker.empty()) {
-        // Check if all reaching kernels agree on the mode as we can otherwise
-        // not guard instructions. We might not be sure about the mode so we
-        // we cannot fix the internal spmd-zation state either.
-        int SPMD = 0, Generic = 0;
-        for (auto *Kernel : ReachingKernelEntries) {
-          auto &CBAA = A.getAAFor<AAKernelInfo>(
-              *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
-          if (CBAA.SPMDCompatibilityTracker.isValidState() &&
-              CBAA.SPMDCompatibilityTracker.isAssumed())
-            ++SPMD;
-          else
-            ++Generic;
-          if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
-            UsedAssumedInformationFromReachingKernels = true;
-        }
-        if (SPMD != 0 && Generic != 0)
+      if (!SPMDCompatibilityTracker.empty()) {
+        if (!ParallelLevels.isValidState())
+          SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+        else if (!ReachingKernelEntries.isValidState())
           SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+        else {
+          // Check if all reaching kernels agree on the mode as we can
+          // otherwise not guard instructions. We might not be sure about the
+          // mode, so we cannot fix the internal spmd-zation state either.
+          int SPMD = 0, Generic = 0;
+          for (auto *Kernel : ReachingKernelEntries) {
+            auto &CBAA = A.getAAFor<AAKernelInfo>(
+                *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
+            if (CBAA.SPMDCompatibilityTracker.isValidState() &&
+                CBAA.SPMDCompatibilityTracker.isAssumed())
+              ++SPMD;
+            else
+              ++Generic;
+            if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
+              UsedAssumedInformationFromReachingKernels = true;
+          }
+          if (SPMD != 0 && Generic != 0)
+            SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+        }
+      }
     }
 
@@ -4077,15 +4440,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
       ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
     }
 
-    // If we are sure there are no parallel regions in the kernel we do not
-    // want SPMD mode.
-    if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
-        ReachedKnownParallelRegions.isAtFixpoint() &&
-        ReachedUnknownParallelRegions.isValidState() &&
-        ReachedKnownParallelRegions.isValidState() &&
-        !mayContainParallelRegion())
-      SPMDCompatibilityTracker.indicatePessimisticFixpoint();
-
     // If we haven't used any assumed information for the SPMD state we can fix
     // it.
     if (!UsedAssumedInformationInCheckRWInst &&
@@ -4288,6 +4642,12 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       if (auto *ParallelRegion = dyn_cast<Function>(
              CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
         ReachedKnownParallelRegions.insert(ParallelRegion);
+        // Check nested parallelism.
+        auto &FnAA = A.getAAFor<AAKernelInfo>(
+            *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+        NestedParallelism |= !FnAA.getState().isValidState() ||
+                             !FnAA.ReachedKnownParallelRegions.empty() ||
+                             !FnAA.ReachedUnknownParallelRegions.empty();
         break;
       }
       // The condition above should usually get the parallel region function
@@ -4419,10 +4779,10 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
     if (!SimplifiedValue)
       return Str + std::string("none");
 
-    if (!SimplifiedValue.value())
+    if (!*SimplifiedValue)
      return Str + std::string("nullptr");
 
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.value()))
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
       return Str + std::to_string(CI->getSExtValue());
 
     return Str + std::string("unknown");
@@ -4445,9 +4805,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
     A.registerSimplificationCallback(
         IRPosition::callsite_returned(CB),
         [&](const IRPosition &IRP, const AbstractAttribute *AA,
-            bool &UsedAssumedInformation) -> Optional<Value *> {
+            bool &UsedAssumedInformation) -> std::optional<Value *> {
          assert((isValidState() ||
-                  (SimplifiedValue && SimplifiedValue.value() == nullptr)) &&
+                  (SimplifiedValue && *SimplifiedValue == nullptr)) &&
                 "Unexpected invalid state!");
 
          if (!isAtFixpoint()) {
@@ -4465,9 +4825,6 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
       case OMPRTL___kmpc_is_spmd_exec_mode:
         Changed |= foldIsSPMDExecMode(A);
         break;
-      case OMPRTL___kmpc_is_generic_main_thread_id:
-        Changed |= foldIsGenericMainThread(A);
-        break;
      case OMPRTL___kmpc_parallel_level:
        Changed |= foldParallelLevel(A);
        break;
@@ -4522,7 +4879,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
 
 private:
   /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
ChangeStatus foldIsSPMDExecMode(Attributor &A) { - Optional<Value *> SimplifiedValueBefore = SimplifiedValue; + std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; @@ -4582,31 +4939,9 @@ private: : ChangeStatus::CHANGED; } - /// Fold __kmpc_is_generic_main_thread_id into a constant if possible. - ChangeStatus foldIsGenericMainThread(Attributor &A) { - Optional<Value *> SimplifiedValueBefore = SimplifiedValue; - - CallBase &CB = cast<CallBase>(getAssociatedValue()); - Function *F = CB.getFunction(); - const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>( - *this, IRPosition::function(*F), DepClassTy::REQUIRED); - - if (!ExecutionDomainAA.isValidState()) - return indicatePessimisticFixpoint(); - - auto &Ctx = getAnchorValue().getContext(); - if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB)) - SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true); - else - return indicatePessimisticFixpoint(); - - return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED - : ChangeStatus::CHANGED; - } - /// Fold __kmpc_parallel_level into a constant if possible. ChangeStatus foldParallelLevel(Attributor &A) { - Optional<Value *> SimplifiedValueBefore = SimplifiedValue; + std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); @@ -4668,7 +5003,7 @@ private: ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) { // Specialize only if all the calls agree with the attribute constant value int32_t CurrentAttrValue = -1; - Optional<Value *> SimplifiedValueBefore = SimplifiedValue; + std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); @@ -4678,10 +5013,7 @@ private: // Iterate over the kernels that reach this function for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { - int32_t NextAttrVal = -1; - if (K->hasFnAttribute(Attr)) - NextAttrVal = - std::stoi(K->getFnAttribute(Attr).getValueAsString().str()); + int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1); if (NextAttrVal == -1 || (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal)) @@ -4701,7 +5033,7 @@ private: /// An optional value the associated value is assumed to fold to. That is, we /// assume the associated value (which is a call) can be replaced by this /// simplified value. - Optional<Value *> SimplifiedValue; + std::optional<Value *> SimplifiedValue; /// The runtime function kind of the callee of the associated call site. RuntimeFunction RFKind; @@ -4744,7 +5076,6 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; InitRFI.foreachUse(SCC, CreateKernelInfoCB); - registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level); registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block); @@ -4752,32 +5083,27 @@ void OpenMPOpt::registerAAs(bool IsModulePass) { } // Create CallSite AA for all Getters. 
-  for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
-    auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
+  if (DeduceICVValues) {
+    for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
+      auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
 
-    auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
+      auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
 
-    auto CreateAA = [&](Use &U, Function &Caller) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
-      if (!CI)
-        return false;
+      auto CreateAA = [&](Use &U, Function &Caller) {
+        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
+        if (!CI)
+          return false;
 
-      auto &CB = cast<CallBase>(*CI);
+        auto &CB = cast<CallBase>(*CI);
 
-      IRPosition CBPos = IRPosition::callsite_function(CB);
-      A.getOrCreateAAFor<AAICVTracker>(CBPos);
-      return false;
-    };
+        IRPosition CBPos = IRPosition::callsite_function(CB);
+        A.getOrCreateAAFor<AAICVTracker>(CBPos);
+        return false;
+      };
 
-    GetterRFI.foreachUse(SCC, CreateAA);
+      GetterRFI.foreachUse(SCC, CreateAA);
+    }
   }
 
-  auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
-  auto CreateAA = [&](Use &U, Function &F) {
-    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
-    return false;
-  };
-  if (!DisableOpenMPOptDeglobalization)
-    GlobalizationRFI.foreachUse(SCC, CreateAA);
 
   // Create an ExecutionDomain AA for every function and a HeapToStack AA for
   // every function if there is a device kernel.
@@ -4788,17 +5114,44 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
     if (F->isDeclaration())
       continue;
 
-    A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
-    if (!DisableOpenMPOptDeglobalization)
-      A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
+    // We look at internal functions only on-demand, but if any use is not a
+    // direct call or is outside the current set of analyzed functions, we
+    // have to do it eagerly.
+    if (F->hasLocalLinkage()) {
+      if (llvm::all_of(F->uses(), [this](const Use &U) {
+            const auto *CB = dyn_cast<CallBase>(U.getUser());
+            return CB && CB->isCallee(&U) &&
+                   A.isRunOn(const_cast<Function *>(CB->getCaller()));
+          }))
+        continue;
+    }
+    registerAAsForFunction(A, *F);
+  }
+}
 
-    for (auto &I : instructions(*F)) {
-      if (auto *LI = dyn_cast<LoadInst>(&I)) {
-        bool UsedAssumedInformation = false;
-        A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
-                               UsedAssumedInformation, AA::Interprocedural);
-      } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
-        A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
+void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
+  if (!DisableOpenMPOptDeglobalization)
+    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
+  A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+  if (!DisableOpenMPOptDeglobalization)
+    A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
+
+  for (auto &I : instructions(F)) {
+    if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      bool UsedAssumedInformation = false;
+      A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
+                             UsedAssumedInformation, AA::Interprocedural);
+      continue;
+    }
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
+      continue;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      if (II->getIntrinsicID() == Intrinsic::assume) {
+        A.getOrCreateAAFor<AAPotentialValues>(
+            IRPosition::value(*II->getArgOperand(0)));
+        continue;
      }
    }
  }
@@ -4970,10 +5323,13 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   }
 
   // Look at every function in the Module unless it was internalized.
+  SetVector<Function *> Functions;
   SmallVector<Function *, 16> SCC;
   for (Function &F : M)
-    if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
+    if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
       SCC.push_back(&F);
+      Functions.insert(&F);
+    }
 
   if (SCC.empty())
     return PreservedAnalyses::all();
 
@@ -4987,18 +5343,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
 
   BumpPtrAllocator Allocator;
   CallGraphUpdater CGUpdater;
 
-  SetVector<Function *> Functions(SCC.begin(), SCC.end());
-  OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
+  OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels);
 
   unsigned MaxFixpointIterations =
       (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
 
   AttributorConfig AC(CGUpdater);
   AC.DefaultInitializeLiveInternals = false;
+  AC.IsModulePass = true;
   AC.RewriteSignatures = false;
   AC.MaxFixpointIterations = MaxFixpointIterations;
   AC.OREGetter = OREGetter;
   AC.PassName = DEBUG_TYPE;
+  AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
 
   Attributor A(Functions, InfoCache, AC);
 
@@ -5062,7 +5419,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
 
   SetVector<Function *> Functions(SCC.begin(), SCC.end());
   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
-                                /*CGSCC*/ Functions, Kernels);
+                                /*CGSCC*/ &Functions, Kernels);
 
   unsigned MaxFixpointIterations =
       (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
@@ -5074,6 +5431,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
   AC.MaxFixpointIterations = MaxFixpointIterations;
   AC.OREGetter = OREGetter;
   AC.PassName = DEBUG_TYPE;
+  AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
 
   Attributor A(Functions, InfoCache, AC);
 
@@ -5089,90 +5447,9 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
   return PreservedAnalyses::all();
 }
 
-namespace {
-
-struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
-  CallGraphUpdater CGUpdater;
-  static char ID;
-
-  OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
-    initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    CallGraphSCCPass::getAnalysisUsage(AU);
-  }
-
-  bool runOnSCC(CallGraphSCC &CGSCC) override {
-    if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
-      return false;
-    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
-      return false;
-
-    SmallVector<Function *, 16> SCC;
-    // If there are kernels in the module, we have to run on all SCC's.
-    for (CallGraphNode *CGN : CGSCC) {
-      Function *Fn = CGN->getFunction();
-      if (!Fn || Fn->isDeclaration())
-        continue;
-      SCC.push_back(Fn);
-    }
-
-    if (SCC.empty())
-      return false;
-
-    Module &M = CGSCC.getCallGraph().getModule();
-    KernelSet Kernels = getDeviceKernels(M);
-
-    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-    CGUpdater.initialize(CG, CGSCC);
-
-    // Maintain a map of functions to avoid rebuilding the ORE
-    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
-    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
-      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
-      if (!ORE)
-        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
-      return *ORE;
-    };
-
-    AnalysisGetter AG;
-    SetVector<Function *> Functions(SCC.begin(), SCC.end());
-    BumpPtrAllocator Allocator;
-    OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
-                                  Allocator,
-                                  /*CGSCC*/ Functions, Kernels);
-
-    unsigned MaxFixpointIterations =
-        (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
-
-    AttributorConfig AC(CGUpdater);
-    AC.DefaultInitializeLiveInternals = false;
-    AC.IsModulePass = false;
-    AC.RewriteSignatures = false;
-    AC.MaxFixpointIterations = MaxFixpointIterations;
-    AC.OREGetter = OREGetter;
-    AC.PassName = DEBUG_TYPE;
-
-    Attributor A(Functions, InfoCache, AC);
-
-    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
-    bool Result = OMPOpt.run(false);
-
-    if (PrintModuleAfterOptimizations)
-      LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
-
-    return Result;
-  }
-
-  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
-};
-
-} // end anonymous namespace
-
 KernelSet llvm::omp::getDeviceKernels(Module &M) {
   // TODO: Create a more cross-platform way of determining device kernels.
-  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
 
   KernelSet Kernels;
 
   if (!MD)
@@ -5213,15 +5490,3 @@ bool llvm::omp::isOpenMPDevice(Module &M) {
 
   return true;
 }
-
-char OpenMPOptCGSCCLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
-                      "OpenMP specific optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
-                    "OpenMP specific optimizations", false, false)
-
-Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
-  return new OpenMPOptCGSCCLegacyPass();
-}
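One behavioral detail in the final hunk: getDeviceKernels now calls getNamedMetadata instead of getOrInsertNamedMetadata, so querying a module for device kernels no longer creates an empty nvvm.annotations node as a side effect. A minimal sketch of the lookup-only pattern, with a hypothetical helper name:

    #include "llvm/IR/Module.h"

    // getNamedMetadata is a pure query; it returns nullptr when the named
    // metadata is absent instead of inserting an empty node.
    static bool hasNVVMAnnotations(llvm::Module &M) {
      return M.getNamedMetadata("nvvm.annotations") != nullptr;
    }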

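The manifest hunk near the top of this section also starts emitting a per-kernel flag global whose name is the kernel name suffixed with "_nested_parallelism". A self-contained sketch of that emission pattern, assuming an existing kernel Function and using a hypothetical wrapper name:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Mirrors the GlobalVariable created in manifest() above: a constant i8
    // recording whether the kernel may reach nested parallel regions.
    static void emitNestedParallelismFlag(Function &Kernel, bool Nested) {
      Module &M = *Kernel.getParent();
      Type *Int8Ty = Type::getInt8Ty(M.getContext());
      // WeakAnyLinkage, matching the hunk above, so duplicate definitions
      // across translation units can coalesce.
      new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
                         GlobalValue::WeakAnyLinkage,
                         ConstantInt::get(Int8Ty, Nested ? 1 : 0),
                         Kernel.getName() + "_nested_parallelism");
    }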