Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
-rw-r--r-- | llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 699
1 file changed, 412 insertions, 287 deletions
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index bee154dab10f..588f3901e3cb 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,8 +22,10 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" @@ -36,6 +38,8 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" @@ -44,7 +48,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/InitializePasses.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/IPO/Attributor.h" @@ -188,9 +192,9 @@ struct AAICVTracker; struct OMPInformationCache : public InformationCache { OMPInformationCache(Module &M, AnalysisGetter &AG, BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC, - KernelSet &Kernels) + bool OpenMPPostLink) : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M), - Kernels(Kernels) { + OpenMPPostLink(OpenMPPostLink) { OMPBuilder.initialize(); initializeRuntimeFunctions(M); @@ -417,7 +421,7 @@ struct OMPInformationCache : public InformationCache { // TODO: We directly convert uses into proper calls and unknown uses. for (Use &U : RFI.Declaration->uses()) { if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { - if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) { + if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) { RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); ++NumUses; } @@ -448,6 +452,24 @@ struct OMPInformationCache : public InformationCache { CI->setCallingConv(Fn->getCallingConv()); } + // Helper function to determine if it's legal to create a call to the runtime + // functions. + bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) { + // We can always emit calls if we haven't yet linked in the runtime. + if (!OpenMPPostLink) + return true; + + // Once the runtime has been already been linked in we cannot emit calls to + // any undefined functions. + for (RuntimeFunction Fn : Fns) { + RuntimeFunctionInfo &RFI = RFIs[Fn]; + + if (RFI.Declaration && RFI.Declaration->isDeclaration()) + return false; + } + return true; + } + /// Helper to initialize all runtime function information for those defined /// in OpenMPKinds.def. void initializeRuntimeFunctions(Module &M) { @@ -518,11 +540,11 @@ struct OMPInformationCache : public InformationCache { // TODO: We should attach the attributes defined in OMPKinds.def. } - /// Collection of known kernels (\see Kernel) in the module. - KernelSet &Kernels; - /// Collection of known OpenMP runtime functions.. DenseSet<const Function *> RTLFunctions; + + /// Indicates if we have already linked in the OpenMP device library. 
+ bool OpenMPPostLink = false; }; template <typename Ty, bool InsertInvalidates = true> @@ -808,7 +830,7 @@ struct OpenMPOpt { return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); } - /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. + /// Run all OpenMP optimizations on the underlying SCC. bool run(bool IsModulePass) { if (SCC.empty()) return false; @@ -816,8 +838,7 @@ struct OpenMPOpt { bool Changed = false; LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() - << " functions in a slice with " - << OMPInfoCache.ModuleSlice.size() << " functions\n"); + << " functions\n"); if (IsModulePass) { Changed |= runAttributor(IsModulePass); @@ -882,7 +903,7 @@ struct OpenMPOpt { /// Print OpenMP GPU kernels for testing. void printKernels() const { for (Function *F : SCC) { - if (!OMPInfoCache.Kernels.count(F)) + if (!omp::isKernel(*F)) continue; auto Remark = [&](OptimizationRemarkAnalysis ORA) { @@ -1412,7 +1433,10 @@ private: Changed |= WasSplit; return WasSplit; }; - RFI.foreachUse(SCC, SplitMemTransfers); + if (OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___tgt_target_data_begin_mapper_issue, + OMPRTL___tgt_target_data_begin_mapper_wait})) + RFI.foreachUse(SCC, SplitMemTransfers); return Changed; } @@ -1681,37 +1705,27 @@ private: }; if (!ReplVal) { - for (Use *U : *UV) + auto *DT = + OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F); + if (!DT) + return false; + Instruction *IP = nullptr; + for (Use *U : *UV) { if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { + if (IP) + IP = DT->findNearestCommonDominator(IP, CI); + else + IP = CI; if (!CanBeMoved(*CI)) continue; - - // If the function is a kernel, dedup will move - // the runtime call right after the kernel init callsite. Otherwise, - // it will move it to the beginning of the caller function. - if (isKernel(F)) { - auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; - auto *KernelInitUV = KernelInitRFI.getUseVector(F); - - if (KernelInitUV->empty()) - continue; - - assert(KernelInitUV->size() == 1 && - "Expected a single __kmpc_target_init in kernel\n"); - - CallInst *KernelInitCI = - getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI); - assert(KernelInitCI && - "Expected a call to __kmpc_target_init in kernel\n"); - - CI->moveAfter(KernelInitCI); - } else - CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); - ReplVal = CI; - break; + if (!ReplVal) + ReplVal = CI; } + } if (!ReplVal) return false; + assert(IP && "Expected insertion point!"); + cast<Instruction>(ReplVal)->moveBefore(IP); } // If we use a call as a replacement value we need to make sure the ident is @@ -1809,9 +1823,6 @@ private: /// ///{{ - /// Check if \p F is a kernel, hence entry point for target offloading. - bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } - /// Cache to remember the unique kernel for a function. DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap; @@ -1920,7 +1931,8 @@ public: }; Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { - if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F)) + if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() && + !OMPInfoCache.CGSCC->contains(&F)) return nullptr; // Use a scope to keep the lifetime of the CachedKernel short. 
@@ -2095,12 +2107,6 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { using Base = StateWrapper<BooleanState, AbstractAttribute>; AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {} - void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - if (!F || !A.isFunctionIPOAmendable(*F)) - indicatePessimisticFixpoint(); - } - /// Returns true if value is assumed to be tracked. bool isAssumedTracked() const { return getAssumed(); } @@ -2146,7 +2152,9 @@ struct AAICVTrackerFunction : public AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { return "ICVTrackerFunction"; } + const std::string getAsStr(Attributor *) const override { + return "ICVTrackerFunction"; + } // FIXME: come up with some stats. void trackStatistics() const override {} @@ -2242,11 +2250,12 @@ struct AAICVTrackerFunction : public AAICVTracker { if (CalledFunction->isDeclaration()) return nullptr; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); - if (ICVTrackingAA.isAssumedTracked()) { - std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); + if (ICVTrackingAA->isAssumedTracked()) { + std::optional<Value *> URV = + ICVTrackingAA->getUniqueReplacementValue(ICV); if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I), OMPInfoCache))) return URV; @@ -2337,7 +2346,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "ICVTrackerFunctionReturned"; } @@ -2362,10 +2371,10 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { @@ -2374,7 +2383,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { auto CheckReturnInst = [&](Instruction &I) { std::optional<Value *> NewReplVal = - ICVTrackingAA.getReplacementValue(ICV, &I, A); + ICVTrackingAA->getReplacementValue(ICV, &I, A); // If we found a second ICV value there is no unique returned value. if (UniqueICVValue && UniqueICVValue != NewReplVal) @@ -2407,9 +2416,7 @@ struct AAICVTrackerCallSite : AAICVTracker { : AAICVTracker(IRP, A) {} void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - if (!F || !A.isFunctionIPOAmendable(*F)) - indicatePessimisticFixpoint(); + assert(getAnchorScope() && "Expected anchor function"); // We only initialize this AA for getters, so we need to know which ICV it // gets. @@ -2438,7 +2445,9 @@ struct AAICVTrackerCallSite : AAICVTracker { } // FIXME: come up with better string. - const std::string getAsStr() const override { return "ICVTrackerCallSite"; } + const std::string getAsStr(Attributor *) const override { + return "ICVTrackerCallSite"; + } // FIXME: come up with some stats. 
void trackStatistics() const override {} @@ -2447,15 +2456,15 @@ struct AAICVTrackerCallSite : AAICVTracker { std::optional<Value *> ReplVal; ChangeStatus updateImpl(Attributor &A) override { - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); // We don't have any information, so we assume it changes the ICV. - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); std::optional<Value *> NewReplVal = - ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); + ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A); if (ReplVal == NewReplVal) return ChangeStatus::UNCHANGED; @@ -2477,7 +2486,7 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "ICVTrackerCallSiteReturned"; } @@ -2503,18 +2512,18 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::returned(*getAssociatedFunction()), DepClassTy::REQUIRED); // We don't have any information, so we assume it changes the ICV. - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; std::optional<Value *> NewReplVal = - ICVTrackingAA.getUniqueReplacementValue(ICV); + ICVTrackingAA->getUniqueReplacementValue(ICV); if (ReplVal == NewReplVal) continue; @@ -2530,26 +2539,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) : AAExecutionDomain(IRP, A) {} - ~AAExecutionDomainFunction() { - delete RPOT; - } + ~AAExecutionDomainFunction() { delete RPOT; } void initialize(Attributor &A) override { - if (getAnchorScope()->isDeclaration()) { - indicatePessimisticFixpoint(); - return; - } - RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope()); + Function *F = getAnchorScope(); + assert(F && "Expected anchor function"); + RPOT = new ReversePostOrderTraversal<Function *>(F); } - const std::string getAsStr() const override { - unsigned TotalBlocks = 0, InitialThreadBlocks = 0; + const std::string getAsStr(Attributor *) const override { + unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0; for (auto &It : BEDMap) { + if (!It.getFirst()) + continue; TotalBlocks++; InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly; + AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly && + It.getSecond().IsReachingAlignedBarrierOnly; } return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" + - std::to_string(TotalBlocks) + " executed by initial thread only"; + std::to_string(AlignedBlocks) + " of " + + std::to_string(TotalBlocks) + + " executed by initial thread / aligned"; } /// See AbstractAttribute::trackStatistics(). 
@@ -2572,7 +2583,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { SmallPtrSet<CallBase *, 16> DeletedBarriers; auto HandleAlignedBarrier = [&](CallBase *CB) { - const ExecutionDomainTy &ED = CEDMap[CB]; + const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr]; if (!ED.IsReachedFromAlignedBarrierOnly || ED.EncounteredNonLocalSideEffect) return; @@ -2596,6 +2607,8 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { CallBase *LastCB = Worklist.pop_back_val(); if (!Visited.insert(LastCB)) continue; + if (LastCB->getFunction() != getAnchorScope()) + continue; if (!DeletedBarriers.count(LastCB)) { A.deleteAfterManifest(*LastCB); continue; @@ -2603,7 +2616,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { // The final aligned barrier (LastCB) reaching the kernel end was // removed already. This means we can go one step further and remove // the barriers encoutered last before (LastCB). - const ExecutionDomainTy &LastED = CEDMap[LastCB]; + const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}]; Worklist.append(LastED.AlignedBarriers.begin(), LastED.AlignedBarriers.end()); } @@ -2619,14 +2632,17 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { for (auto *CB : AlignedBarriers) HandleAlignedBarrier(CB); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); // Handle the "kernel end barrier" for kernels too. - if (OMPInfoCache.Kernels.count(getAnchorScope())) + if (omp::isKernel(*getAnchorScope())) HandleAlignedBarrier(nullptr); return Changed; } + bool isNoOpFence(const FenceInst &FI) const override { + return getState().isValidState() && !NonNoOpFences.count(&FI); + } + /// Merge barrier and assumption information from \p PredED into the successor /// \p ED. void @@ -2636,12 +2652,12 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { /// Merge all information from \p PredED into the successor \p ED. If /// \p InitialEdgeOnly is set, only the initial edge will enter the block /// represented by \p ED from this predecessor. - void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, + bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, bool InitialEdgeOnly = false); /// Accumulate information for the entry block in \p EntryBBED. - void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED); + bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED); /// See AbstractAttribute::updateImpl. ChangeStatus updateImpl(Attributor &A) override; @@ -2651,14 +2667,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { if (!isValidState()) return false; + assert(BB.getParent() == getAnchorScope() && "Block is out of scope!"); return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly; } bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const override { - if (!isValidState() || isa<CallBase>(I)) + assert(I.getFunction() == getAnchorScope() && + "Instruction is out of scope!"); + if (!isValidState()) return false; + bool ForwardIsOk = true; const Instruction *CurI; // Check forward until a call or the block end is reached. 
@@ -2667,15 +2687,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast<CallBase>(CurI); if (!CB) continue; - const auto &It = CEDMap.find(CB); + if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) + return true; + const auto &It = CEDMap.find({CB, PRE}); if (It == CEDMap.end()) continue; - if (!It->getSecond().IsReachedFromAlignedBarrierOnly) - return false; + if (!It->getSecond().IsReachingAlignedBarrierOnly) + ForwardIsOk = false; + break; } while ((CurI = CurI->getNextNonDebugInstruction())); - if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly) - return false; + if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly) + ForwardIsOk = false; // Check backward until a call or the block beginning is reached. CurI = &I; @@ -2683,33 +2706,30 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast<CallBase>(CurI); if (!CB) continue; - const auto &It = CEDMap.find(CB); + if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) + return true; + const auto &It = CEDMap.find({CB, POST}); if (It == CEDMap.end()) continue; - if (!AA::isNoSyncInst(A, *CB, *this)) { - if (It->getSecond().IsReachedFromAlignedBarrierOnly) - break; - return false; - } - - Function *Callee = CB->getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - return false; - const auto &EDAA = A.getAAFor<AAExecutionDomain>( - *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); - if (!EDAA.getState().isValidState()) - return false; - if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly) - return false; - break; + if (It->getSecond().IsReachedFromAlignedBarrierOnly) + break; + return false; } while ((CurI = CurI->getPrevNonDebugInstruction())); - if (!CurI && - !llvm::all_of( - predecessors(I.getParent()), [&](const BasicBlock *PredBB) { - return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly; - })) { + // Delayed decision on the forward pass to allow aligned barrier detection + // in the backwards traversal. + if (!ForwardIsOk) return false; + + if (!CurI) { + const BasicBlock *BB = I.getParent(); + if (BB == &BB->getParent()->getEntryBlock()) + return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly; + if (!llvm::all_of(predecessors(BB), [&](const BasicBlock *PredBB) { + return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly; + })) { + return false; + } } // On neither traversal we found a anything but aligned barriers. @@ -2721,15 +2741,16 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { "No request should be made against an invalid state!"); return BEDMap.lookup(&BB); } - ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override { + std::pair<ExecutionDomainTy, ExecutionDomainTy> + getExecutionDomain(const CallBase &CB) const override { assert(isValidState() && "No request should be made against an invalid state!"); - return CEDMap.lookup(&CB); + return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})}; } ExecutionDomainTy getFunctionExecutionDomain() const override { assert(isValidState() && "No request should be made against an invalid state!"); - return BEDMap.lookup(nullptr); + return InterProceduralED; } ///} @@ -2778,12 +2799,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { return false; }; + /// Mapping containing information about the function for other AAs. 
+ ExecutionDomainTy InterProceduralED; + + enum Direction { PRE = 0, POST = 1 }; /// Mapping containing information per block. DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap; - DenseMap<const CallBase *, ExecutionDomainTy> CEDMap; + DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy> + CEDMap; SmallSetVector<CallBase *, 16> AlignedBarriers; ReversePostOrderTraversal<Function *> *RPOT = nullptr; + + /// Set \p R to \V and report true if that changed \p R. + static bool setAndRecord(bool &R, bool V) { + bool Eq = (R == V); + R = V; + return !Eq; + } + + /// Collection of fences known to be non-no-opt. All fences not in this set + /// can be assumed no-opt. + SmallPtrSet<const FenceInst *, 8> NonNoOpFences; }; void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( @@ -2795,62 +2832,82 @@ void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( ED.addAlignedBarrier(A, *AB); } -void AAExecutionDomainFunction::mergeInPredecessor( +bool AAExecutionDomainFunction::mergeInPredecessor( Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, bool InitialEdgeOnly) { - ED.IsExecutedByInitialThreadOnly = - InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && - ED.IsExecutedByInitialThreadOnly); - - ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly && - PredED.IsReachedFromAlignedBarrierOnly; - ED.EncounteredNonLocalSideEffect = - ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect; + + bool Changed = false; + Changed |= + setAndRecord(ED.IsExecutedByInitialThreadOnly, + InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && + ED.IsExecutedByInitialThreadOnly)); + + Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly, + ED.IsReachedFromAlignedBarrierOnly && + PredED.IsReachedFromAlignedBarrierOnly); + Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect, + ED.EncounteredNonLocalSideEffect | + PredED.EncounteredNonLocalSideEffect); + // Do not track assumptions and barriers as part of Changed. 
if (ED.IsReachedFromAlignedBarrierOnly) mergeInPredecessorBarriersAndAssumptions(A, ED, PredED); else ED.clearAssumeInstAndAlignedBarriers(); + return Changed; } -void AAExecutionDomainFunction::handleEntryBB(Attributor &A, +bool AAExecutionDomainFunction::handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED) { - SmallVector<ExecutionDomainTy> PredExecDomains; + SmallVector<std::pair<ExecutionDomainTy, ExecutionDomainTy>, 4> CallSiteEDs; auto PredForCallSite = [&](AbstractCallSite ACS) { - const auto &EDAA = A.getAAFor<AAExecutionDomain>( + const auto *EDAA = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*ACS.getInstruction()->getFunction()), DepClassTy::OPTIONAL); - if (!EDAA.getState().isValidState()) + if (!EDAA || !EDAA->getState().isValidState()) return false; - PredExecDomains.emplace_back( - EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction()))); + CallSiteEDs.emplace_back( + EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction()))); return true; }; + ExecutionDomainTy ExitED; bool AllCallSitesKnown; if (A.checkForAllCallSites(PredForCallSite, *this, /* RequiresAllCallSites */ true, AllCallSitesKnown)) { - for (const auto &PredED : PredExecDomains) - mergeInPredecessor(A, EntryBBED, PredED); + for (const auto &[CSInED, CSOutED] : CallSiteEDs) { + mergeInPredecessor(A, EntryBBED, CSInED); + ExitED.IsReachingAlignedBarrierOnly &= + CSOutED.IsReachingAlignedBarrierOnly; + } } else { // We could not find all predecessors, so this is either a kernel or a // function with external linkage (or with some other weird uses). - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - if (OMPInfoCache.Kernels.count(getAnchorScope())) { + if (omp::isKernel(*getAnchorScope())) { EntryBBED.IsExecutedByInitialThreadOnly = false; EntryBBED.IsReachedFromAlignedBarrierOnly = true; EntryBBED.EncounteredNonLocalSideEffect = false; + ExitED.IsReachingAlignedBarrierOnly = true; } else { EntryBBED.IsExecutedByInitialThreadOnly = false; EntryBBED.IsReachedFromAlignedBarrierOnly = false; EntryBBED.EncounteredNonLocalSideEffect = true; + ExitED.IsReachingAlignedBarrierOnly = false; } } + bool Changed = false; auto &FnED = BEDMap[nullptr]; - FnED.IsReachingAlignedBarrierOnly &= - EntryBBED.IsReachedFromAlignedBarrierOnly; + Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly, + FnED.IsReachedFromAlignedBarrierOnly & + EntryBBED.IsReachedFromAlignedBarrierOnly); + Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly, + FnED.IsReachingAlignedBarrierOnly & + ExitED.IsReachingAlignedBarrierOnly); + Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly, + EntryBBED.IsExecutedByInitialThreadOnly); + return Changed; } ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { @@ -2860,36 +2917,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Helper to deal with an aligned barrier encountered during the forward // traversal. \p CB is the aligned barrier, \p ED is the execution domain when // it was encountered. - auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) { - if (CB) - Changed |= AlignedBarriers.insert(CB); + auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) { + Changed |= AlignedBarriers.insert(&CB); // First, update the barrier ED kept in the separate CEDMap. 
- auto &CallED = CEDMap[CB]; - mergeInPredecessor(A, CallED, ED); + auto &CallInED = CEDMap[{&CB, PRE}]; + Changed |= mergeInPredecessor(A, CallInED, ED); + CallInED.IsReachingAlignedBarrierOnly = true; // Next adjust the ED we use for the traversal. ED.EncounteredNonLocalSideEffect = false; ED.IsReachedFromAlignedBarrierOnly = true; // Aligned barrier collection has to come last. ED.clearAssumeInstAndAlignedBarriers(); - if (CB) - ED.addAlignedBarrier(A, *CB); + ED.addAlignedBarrier(A, CB); + auto &CallOutED = CEDMap[{&CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); }; - auto &LivenessAA = + auto *LivenessAA = A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL); - // Set \p R to \V and report true if that changed \p R. - auto SetAndRecord = [&](bool &R, bool V) { - bool Eq = (R == V); - R = V; - return !Eq; - }; - - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - Function *F = getAnchorScope(); BasicBlock &EntryBB = F->getEntryBlock(); - bool IsKernel = OMPInfoCache.Kernels.count(F); + bool IsKernel = omp::isKernel(*F); SmallVector<Instruction *> SyncInstWorklist; for (auto &RIt : *RPOT) { @@ -2899,18 +2948,19 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // TODO: We use local reasoning since we don't have a divergence analysis // running as well. We could basically allow uniform branches here. bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel; + bool IsExplicitlyAligned = IsEntryBB && IsKernel; ExecutionDomainTy ED; // Propagate "incoming edges" into information about this block. if (IsEntryBB) { - handleEntryBB(A, ED); + Changed |= handleCallees(A, ED); } else { // For live non-entry blocks we only propagate // information via live edges. - if (LivenessAA.isAssumedDead(&BB)) + if (LivenessAA && LivenessAA->isAssumedDead(&BB)) continue; for (auto *PredBB : predecessors(&BB)) { - if (LivenessAA.isEdgeDead(PredBB, &BB)) + if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB)) continue; bool InitialEdgeOnly = isInitialThreadOnlyEdge( A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB); @@ -2922,7 +2972,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // information to calls. for (Instruction &I : BB) { bool UsedAssumedInformation; - if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation, + if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation, /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL, /* CheckForDeadStore */ true)) continue; @@ -2939,6 +2989,33 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { continue; } + if (auto *FI = dyn_cast<FenceInst>(&I)) { + if (!ED.EncounteredNonLocalSideEffect) { + // An aligned fence without non-local side-effects is a no-op. + if (ED.IsReachedFromAlignedBarrierOnly) + continue; + // A non-aligned fence without non-local side-effects is a no-op + // if the ordering only publishes non-local side-effects (or less). 
+ switch (FI->getOrdering()) { + case AtomicOrdering::NotAtomic: + continue; + case AtomicOrdering::Unordered: + continue; + case AtomicOrdering::Monotonic: + continue; + case AtomicOrdering::Acquire: + break; + case AtomicOrdering::Release: + continue; + case AtomicOrdering::AcquireRelease: + break; + case AtomicOrdering::SequentiallyConsistent: + break; + }; + } + NonNoOpFences.insert(FI); + } + auto *CB = dyn_cast<CallBase>(&I); bool IsNoSync = AA::isNoSyncInst(A, I, *this); bool IsAlignedBarrier = @@ -2946,14 +3023,16 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock); AlignedBarrierLastInBlock &= IsNoSync; + IsExplicitlyAligned &= IsNoSync; // Next we check for calls. Aligned barriers are handled // explicitly, everything else is kept for the backward traversal and will // also affect our state. if (CB) { if (IsAlignedBarrier) { - HandleAlignedBarrier(CB, ED); + HandleAlignedBarrier(*CB, ED); AlignedBarrierLastInBlock = true; + IsExplicitlyAligned = true; continue; } @@ -2971,20 +3050,20 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Record how we entered the call, then accumulate the effect of the // call in ED for potential use by the callee. - auto &CallED = CEDMap[CB]; - mergeInPredecessor(A, CallED, ED); + auto &CallInED = CEDMap[{CB, PRE}]; + Changed |= mergeInPredecessor(A, CallInED, ED); // If we have a sync-definition we can check if it starts/ends in an // aligned barrier. If we are unsure we assume any sync breaks // alignment. Function *Callee = CB->getCalledFunction(); if (!IsNoSync && Callee && !Callee->isDeclaration()) { - const auto &EDAA = A.getAAFor<AAExecutionDomain>( + const auto *EDAA = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); - if (EDAA.getState().isValidState()) { - const auto &CalleeED = EDAA.getFunctionExecutionDomain(); + if (EDAA && EDAA->getState().isValidState()) { + const auto &CalleeED = EDAA->getFunctionExecutionDomain(); ED.IsReachedFromAlignedBarrierOnly = - CalleeED.IsReachedFromAlignedBarrierOnly; + CalleeED.IsReachedFromAlignedBarrierOnly; AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly; if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly) ED.EncounteredNonLocalSideEffect |= @@ -2992,19 +3071,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { else ED.EncounteredNonLocalSideEffect = CalleeED.EncounteredNonLocalSideEffect; - if (!CalleeED.IsReachingAlignedBarrierOnly) + if (!CalleeED.IsReachingAlignedBarrierOnly) { + Changed |= + setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); SyncInstWorklist.push_back(&I); + } if (CalleeED.IsReachedFromAlignedBarrierOnly) mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED); + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); continue; } } - ED.IsReachedFromAlignedBarrierOnly = - IsNoSync && ED.IsReachedFromAlignedBarrierOnly; + if (!IsNoSync) { + ED.IsReachedFromAlignedBarrierOnly = false; + Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); + SyncInstWorklist.push_back(&I); + } AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly; ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory(); - if (!IsNoSync) - SyncInstWorklist.push_back(&I); + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); } if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) @@ -3013,7 +3100,7 @@ ChangeStatus 
AAExecutionDomainFunction::updateImpl(Attributor &A) { // If we have a callee we try to use fine-grained information to // determine local side-effects. if (CB) { - const auto &MemAA = A.getAAFor<AAMemoryLocation>( + const auto *MemAA = A.getAAFor<AAMemoryLocation>( *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL); auto AccessPred = [&](const Instruction *I, const Value *Ptr, @@ -3021,13 +3108,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { AAMemoryLocation::MemoryLocationsKind) { return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I); }; - if (MemAA.getState().isValidState() && - MemAA.checkForAllAccessesToMemoryKind( + if (MemAA && MemAA->getState().isValidState() && + MemAA->checkForAllAccessesToMemoryKind( AccessPred, AAMemoryLocation::ALL_LOCATIONS)) continue; } - if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I)) + auto &InfoCache = A.getInfoCache(); + if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I)) continue; if (auto *LI = dyn_cast<LoadInst>(&I)) @@ -3039,18 +3127,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { ED.EncounteredNonLocalSideEffect = true; } + bool IsEndAndNotReachingAlignedBarriersOnly = false; if (!isa<UnreachableInst>(BB.getTerminator()) && !BB.getTerminator()->getNumSuccessors()) { - auto &FnED = BEDMap[nullptr]; - mergeInPredecessor(A, FnED, ED); + Changed |= mergeInPredecessor(A, InterProceduralED, ED); - if (IsKernel) - HandleAlignedBarrier(nullptr, ED); + auto &FnED = BEDMap[nullptr]; + if (IsKernel && !IsExplicitlyAligned) + FnED.IsReachingAlignedBarrierOnly = false; + Changed |= mergeInPredecessor(A, FnED, ED); + + if (!FnED.IsReachingAlignedBarrierOnly) { + IsEndAndNotReachingAlignedBarriersOnly = true; + SyncInstWorklist.push_back(BB.getTerminator()); + auto &BBED = BEDMap[&BB]; + Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false); + } } ExecutionDomainTy &StoredED = BEDMap[&BB]; - ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly; + ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly & + !IsEndAndNotReachingAlignedBarriersOnly; // Check if we computed anything different as part of the forward // traversal. 
We do not take assumptions and aligned barriers into account @@ -3074,36 +3172,38 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { while (!SyncInstWorklist.empty()) { Instruction *SyncInst = SyncInstWorklist.pop_back_val(); Instruction *CurInst = SyncInst; - bool HitAlignedBarrier = false; + bool HitAlignedBarrierOrKnownEnd = false; while ((CurInst = CurInst->getPrevNode())) { auto *CB = dyn_cast<CallBase>(CurInst); if (!CB) continue; - auto &CallED = CEDMap[CB]; - if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false)) - Changed = true; - HitAlignedBarrier = AlignedBarriers.count(CB); - if (HitAlignedBarrier) + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false); + auto &CallInED = CEDMap[{CB, PRE}]; + HitAlignedBarrierOrKnownEnd = + AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly; + if (HitAlignedBarrierOrKnownEnd) break; + Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); } - if (HitAlignedBarrier) + if (HitAlignedBarrierOrKnownEnd) continue; BasicBlock *SyncBB = SyncInst->getParent(); for (auto *PredBB : predecessors(SyncBB)) { - if (LivenessAA.isEdgeDead(PredBB, SyncBB)) + if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB)) continue; if (!Visited.insert(PredBB)) continue; - SyncInstWorklist.push_back(PredBB->getTerminator()); auto &PredED = BEDMap[PredBB]; - if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) + if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) { Changed = true; + SyncInstWorklist.push_back(PredBB->getTerminator()); + } } if (SyncBB != &EntryBB) continue; - auto &FnED = BEDMap[nullptr]; - if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false)) - Changed = true; + Changed |= + setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false); } return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; @@ -3146,7 +3246,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A) : AAHeapToShared(IRP, A) {} - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) + " malloc calls eligible."; } @@ -3261,7 +3361,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); auto *SharedMem = new GlobalVariable( *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, - UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, + PoisonValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, GlobalValue::NotThreadLocal, static_cast<unsigned>(AddressSpace::Shared)); auto *NewBuffer = @@ -3270,7 +3370,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto Remark = [&](OptimizationRemark OR) { return OR << "Replaced globalized variable with " << ore::NV("SharedMemory", AllocSize->getZExtValue()) - << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ") + << (AllocSize->isOne() ? 
" byte " : " bytes ") << "of shared memory."; }; A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark); @@ -3278,7 +3378,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { MaybeAlign Alignment = CB->getRetAlign(); assert(Alignment && "HeapToShared on allocation without alignment attribute"); - SharedMem->setAlignment(MaybeAlign(Alignment)); + SharedMem->setAlignment(*Alignment); A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer); A.deleteAfterManifest(*CB); @@ -3315,9 +3415,9 @@ struct AAHeapToSharedFunction : public AAHeapToShared { MallocCalls.remove(CB); continue; } - const auto &ED = A.getAAFor<AAExecutionDomain>( + const auto *ED = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*F), DepClassTy::REQUIRED); - if (!ED.isExecutedByInitialThreadOnly(*CB)) + if (!ED || !ED->isExecutedByInitialThreadOnly(*CB)) MallocCalls.remove(CB); } } @@ -3346,7 +3446,7 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { void trackStatistics() const override {} /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { if (!isValidState()) return "<invalid>"; return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD" @@ -3456,22 +3556,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> std::optional<Value *> { - // IRP represents the "use generic state machine" argument of an - // __kmpc_target_init call. We will answer this one with the internal - // state. As long as we are not in an invalid state, we will create a - // custom state machine so the value should be a `i1 false`. If we are - // in an invalid state, we won't change the value that is in the IR. - if (!ReachedKnownParallelRegions.isValidState()) - return nullptr; - // If we have disabled state machine rewrites, don't make a custom one. - if (DisableOpenMPOptStateMachineRewrite) return nullptr; - if (AA) - A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); - UsedAssumedInformation = !isAtFixpoint(); - auto *FalseVal = - ConstantInt::getBool(IRP.getAnchorValue().getContext(), false); - return FalseVal; }; Attributor::SimplifictionCallbackTy ModeSimplifyCB = @@ -3622,10 +3707,11 @@ struct AAKernelInfoFunction : AAKernelInfo { Function *Kernel = getAnchorScope(); Module &M = *Kernel->getParent(); Type *Int8Ty = Type::getInt8Ty(M.getContext()); - new GlobalVariable(M, Int8Ty, /* isConstant */ true, - GlobalValue::WeakAnyLinkage, - ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), - Kernel->getName() + "_nested_parallelism"); + auto *GV = new GlobalVariable( + M, Int8Ty, /* isConstant */ true, GlobalValue::WeakAnyLinkage, + ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), + Kernel->getName() + "_nested_parallelism"); + GV->setVisibility(GlobalValue::HiddenVisibility); // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. @@ -3914,6 +4000,12 @@ struct AAKernelInfoFunction : AAKernelInfo { bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + // We cannot change to SPMD mode if the runtime functions aren't availible. 
+ if (!OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___kmpc_get_hardware_thread_id_in_block, + OMPRTL___kmpc_barrier_simple_spmd})) + return false; + if (!SPMDCompatibilityTracker.isAssumed()) { for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) { if (!NonCompatibleI) @@ -3951,7 +4043,7 @@ struct AAKernelInfoFunction : AAKernelInfo { auto *CB = cast<CallBase>(Kernel->user_back()); Kernel = CB->getCaller(); } - assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); + assert(omp::isKernel(*Kernel) && "Expected kernel function!"); // Check if the kernel is already in SPMD mode, if so, return success. GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( @@ -4021,6 +4113,13 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!ReachedKnownParallelRegions.isValidState()) return ChangeStatus::UNCHANGED; + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + if (!OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___kmpc_get_hardware_num_threads_in_block, + OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic, + OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel})) + return ChangeStatus::UNCHANGED; + const int InitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; @@ -4167,7 +4266,6 @@ struct AAKernelInfoFunction : AAKernelInfo { BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); Module &M = *Kernel->getParent(); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); FunctionCallee BlockHwSizeFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_hardware_num_threads_in_block); @@ -4220,10 +4318,7 @@ struct AAKernelInfoFunction : AAKernelInfo { if (WorkFnAI->getType()->getPointerAddressSpace() != (unsigned int)AddressSpace::Generic) { WorkFnAI = new AddrSpaceCastInst( - WorkFnAI, - PointerType::getWithSamePointeeType( - cast<PointerType>(WorkFnAI->getType()), - (unsigned int)AddressSpace::Generic), + WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic), WorkFnAI->getName() + ".generic", StateMachineBeginBB); WorkFnAI->setDebugLoc(DLoc); } @@ -4345,19 +4440,20 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!I.mayWriteToMemory()) return true; if (auto *SI = dyn_cast<StoreInst>(&I)) { - const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>( + const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>( *this, IRPosition::value(*SI->getPointerOperand()), DepClassTy::OPTIONAL); - auto &HS = A.getAAFor<AAHeapToStack>( + auto *HS = A.getAAFor<AAHeapToStack>( *this, IRPosition::function(*I.getFunction()), DepClassTy::OPTIONAL); - if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) { + if (UnderlyingObjsAA && + UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) { if (AA::isAssumedThreadLocalObject(A, Obj, *this)) return true; // Check for AAHeapToStack moved objects which must not be // guarded. auto *CB = dyn_cast<CallBase>(&Obj); - return CB && HS.isAssumedHeapToStack(*CB); + return CB && HS && HS->isAssumedHeapToStack(*CB); })) return true; } @@ -4392,14 +4488,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // we cannot fix the internal spmd-zation state either. 
int SPMD = 0, Generic = 0; for (auto *Kernel : ReachingKernelEntries) { - auto &CBAA = A.getAAFor<AAKernelInfo>( + auto *CBAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL); - if (CBAA.SPMDCompatibilityTracker.isValidState() && - CBAA.SPMDCompatibilityTracker.isAssumed()) + if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() && + CBAA->SPMDCompatibilityTracker.isAssumed()) ++SPMD; else ++Generic; - if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint()) + if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint()) UsedAssumedInformationFromReachingKernels = true; } if (SPMD != 0 && Generic != 0) @@ -4413,14 +4509,16 @@ struct AAKernelInfoFunction : AAKernelInfo { bool AllSPMDStatesWereFixed = true; auto CheckCallInst = [&](Instruction &I) { auto &CB = cast<CallBase>(I); - auto &CBAA = A.getAAFor<AAKernelInfo>( + auto *CBAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); - getState() ^= CBAA.getState(); - AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); + if (!CBAA) + return false; + getState() ^= CBAA->getState(); + AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint(); AllParallelRegionStatesWereFixed &= - CBAA.ReachedKnownParallelRegions.isAtFixpoint(); + CBAA->ReachedKnownParallelRegions.isAtFixpoint(); AllParallelRegionStatesWereFixed &= - CBAA.ReachedUnknownParallelRegions.isAtFixpoint(); + CBAA->ReachedUnknownParallelRegions.isAtFixpoint(); return true; }; @@ -4460,10 +4558,10 @@ private: assert(Caller && "Caller is nullptr"); - auto &CAA = A.getOrCreateAAFor<AAKernelInfo>( + auto *CAA = A.getOrCreateAAFor<AAKernelInfo>( IRPosition::function(*Caller), this, DepClassTy::REQUIRED); - if (CAA.ReachingKernelEntries.isValidState()) { - ReachingKernelEntries ^= CAA.ReachingKernelEntries; + if (CAA && CAA->ReachingKernelEntries.isValidState()) { + ReachingKernelEntries ^= CAA->ReachingKernelEntries; return true; } @@ -4491,9 +4589,9 @@ private: assert(Caller && "Caller is nullptr"); - auto &CAA = + auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller)); - if (CAA.ParallelLevels.isValidState()) { + if (CAA && CAA->ParallelLevels.isValidState()) { // Any function that is called by `__kmpc_parallel_51` will not be // folded as the parallel level in the function is updated. In order to // get it right, all the analysis would depend on the implentation. That @@ -4504,7 +4602,7 @@ private: return true; } - ParallelLevels ^= CAA.ParallelLevels; + ParallelLevels ^= CAA->ParallelLevels; return true; } @@ -4538,11 +4636,11 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); Function *Callee = getAssociatedFunction(); - auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>( *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); // Check for SPMD-mode assumptions. - if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) { + if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) { SPMDCompatibilityTracker.indicateOptimisticFixpoint(); indicateOptimisticFixpoint(); } @@ -4567,8 +4665,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { // Unknown callees might contain parallel regions, except if they have // an appropriate assumption attached. 
- if (!(AssumptionAA.hasAssumption("omp_no_openmp") || - AssumptionAA.hasAssumption("omp_no_parallelism"))) + if (!AssumptionAA || + !(AssumptionAA->hasAssumption("omp_no_openmp") || + AssumptionAA->hasAssumption("omp_no_parallelism"))) ReachedUnknownParallelRegions.insert(&CB); // If SPMDCompatibilityTracker is not fixed, we need to give up on the @@ -4643,11 +4742,11 @@ struct AAKernelInfoCallSite : AAKernelInfo { CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) { ReachedKnownParallelRegions.insert(ParallelRegion); /// Check nested parallelism - auto &FnAA = A.getAAFor<AAKernelInfo>( + auto *FnAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL); - NestedParallelism |= !FnAA.getState().isValidState() || - !FnAA.ReachedKnownParallelRegions.empty() || - !FnAA.ReachedUnknownParallelRegions.empty(); + NestedParallelism |= !FnAA || !FnAA->getState().isValidState() || + !FnAA->ReachedKnownParallelRegions.empty() || + !FnAA->ReachedUnknownParallelRegions.empty(); break; } // The condition above should usually get the parallel region function @@ -4691,10 +4790,12 @@ struct AAKernelInfoCallSite : AAKernelInfo { // If F is not a runtime function, propagate the AAKernelInfo of the callee. if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { const IRPosition &FnPos = IRPosition::function(*F); - auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED); - if (getState() == FnAA.getState()) + auto *FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED); + if (!FnAA) + return indicatePessimisticFixpoint(); + if (getState() == FnAA->getState()) return ChangeStatus::UNCHANGED; - getState() = FnAA.getState(); + getState() = FnAA->getState(); return ChangeStatus::CHANGED; } @@ -4707,9 +4808,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); - auto &HeapToStackAA = A.getAAFor<AAHeapToStack>( + auto *HeapToStackAA = A.getAAFor<AAHeapToStack>( *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); - auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>( + auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>( *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); RuntimeFunction RF = It->getSecond(); @@ -4718,13 +4819,15 @@ struct AAKernelInfoCallSite : AAKernelInfo { // If neither HeapToStack nor HeapToShared assume the call is removed, // assume SPMD incompatibility. 
case OMPRTL___kmpc_alloc_shared: - if (!HeapToStackAA.isAssumedHeapToStack(CB) && - !HeapToSharedAA.isAssumedHeapToShared(CB)) + if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) && + (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB))) SPMDCompatibilityTracker.insert(&CB); break; case OMPRTL___kmpc_free_shared: - if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) && - !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB)) + if ((!HeapToStackAA || + !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) && + (!HeapToSharedAA || + !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB))) SPMDCompatibilityTracker.insert(&CB); break; default: @@ -4770,7 +4873,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { : AAFoldRuntimeCall(IRP, A) {} /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { if (!isValidState()) return "<invalid>"; @@ -4883,28 +4986,29 @@ private: unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { - auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { + auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), DepClassTy::REQUIRED); - if (!AA.isValidState()) { + if (!AA || !AA->isValidState()) { SimplifiedValue = nullptr; return indicatePessimisticFixpoint(); } - if (AA.SPMDCompatibilityTracker.isAssumed()) { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAssumed()) { + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownSPMDCount; else ++AssumedSPMDCount; } else { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownNonSPMDCount; else ++AssumedNonSPMDCount; @@ -4943,16 +5047,17 @@ private: ChangeStatus foldParallelLevel(Attributor &A) { std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ParallelLevels.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ParallelLevels.isValidState()) return indicatePessimisticFixpoint(); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); - if (CallerKernelInfoAA.ReachingKernelEntries.empty()) { + if (CallerKernelInfoAA->ReachingKernelEntries.empty()) { assert(!SimplifiedValue && "SimplifiedValue should keep none at this point"); return ChangeStatus::UNCHANGED; @@ -4960,19 +5065,19 @@ private: unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { - auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { + 
auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), DepClassTy::REQUIRED); - if (!AA.SPMDCompatibilityTracker.isValidState()) + if (!AA || !AA->SPMDCompatibilityTracker.isValidState()) return indicatePessimisticFixpoint(); - if (AA.SPMDCompatibilityTracker.isAssumed()) { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAssumed()) { + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownSPMDCount; else ++AssumedSPMDCount; } else { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownNonSPMDCount; else ++AssumedNonSPMDCount; @@ -5005,14 +5110,15 @@ private: int32_t CurrentAttrValue = -1; std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); // Iterate over the kernels that reach this function - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1); if (NextAttrVal == -1 || @@ -5135,6 +5241,8 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) { A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F)); if (!DisableOpenMPOptDeglobalization) A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F)); + if (F.hasFnAttribute(Attribute::Convergent)) + A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F)); for (auto &I : instructions(F)) { if (auto *LI = dyn_cast<LoadInst>(&I)) { @@ -5147,6 +5255,10 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) { A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI)); continue; } + if (auto *FI = dyn_cast<FenceInst>(&I)) { + A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI)); + continue; + } if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::assume) { A.getOrCreateAAFor<AAPotentialValues>( @@ -5304,6 +5416,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { }); }; + bool Changed = false; + // Create internal copies of each function if this is a kernel Module. This // allows iterprocedural passes to see every call edge. DenseMap<Function *, Function *> InternalizedMap; @@ -5319,7 +5433,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { } } - Attributor::internalizeFunctions(InternalizeFns, InternalizedMap); + Changed |= + Attributor::internalizeFunctions(InternalizeFns, InternalizedMap); } // Look at every function in the Module unless it was internalized. @@ -5332,7 +5447,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { } if (SCC.empty()) - return PreservedAnalyses::all(); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); AnalysisGetter AG(FAM); @@ -5343,7 +5458,9 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { BumpPtrAllocator Allocator; CallGraphUpdater CGUpdater; - OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels); + bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || + LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink; + OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, PostLink); unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; @@ -5356,11 +5473,14 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { AC.OREGetter = OREGetter; AC.PassName = DEBUG_TYPE; AC.InitializationCallback = OpenMPOpt::registerAAsForFunction; + AC.IPOAmendableCB = [](const Function &F) { + return F.hasFnAttribute("kernel"); + }; Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); - bool Changed = OMPOpt.run(true); + Changed |= OMPOpt.run(true); // Optionally inline device functions for potentially better performance. if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M)) @@ -5417,9 +5537,11 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, CallGraphUpdater CGUpdater; CGUpdater.initialize(CG, C, AM, UR); + bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || + LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink; SetVector<Function *> Functions(SCC.begin(), SCC.end()); OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, - /*CGSCC*/ &Functions, Kernels); + /*CGSCC*/ &Functions, PostLink); unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; @@ -5447,6 +5569,8 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); } +bool llvm::omp::isKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); } + KernelSet llvm::omp::getDeviceKernels(Module &M) { // TODO: Create a more cross-platform way of determining device kernels. NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); @@ -5467,6 +5591,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) { if (!KernelFn) continue; + assert(isKernel(*KernelFn) && "Inconsistent kernel function annotation"); ++NumOpenMPTargetRegionKernels; Kernels.insert(KernelFn); |
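
A recurring pattern in this patch is the post-link gate: once the OpenMP device runtime has been linked in (OpenMPPostLink), the pass only introduces calls to runtime functions that already have definitions, via OMPInformationCache::runtimeFnsAvailable(). Below is a minimal, self-contained sketch of that idea in plain C++; it is not LLVM code, and ModuleView, IsDeclarationOnly, and mayChangeToSPMDMode are illustrative stand-ins for the real OMPInformationCache/RuntimeFunctionInfo machinery and the OMPRTL_* enumerators.

#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for the module's view of runtime functions:
// a function is "declaration only" if it has no body in this module.
struct ModuleView {
  std::unordered_map<std::string, bool> IsDeclarationOnly;
};

// Mirrors the shape of OMPInformationCache::runtimeFnsAvailable(): before the
// device runtime is linked in, any call may be emitted; afterwards, calls are
// only legal if every required function already has a definition.
bool runtimeFnsAvailable(const ModuleView &M, bool OpenMPPostLink,
                         const std::vector<std::string> &Fns) {
  if (!OpenMPPostLink)
    return true;
  for (const std::string &Fn : Fns) {
    auto It = M.IsDeclarationOnly.find(Fn);
    if (It != M.IsDeclarationOnly.end() && It->second)
      return false;
  }
  return true;
}

// Usage mirroring changeToSPMDMode(): bail out instead of introducing calls
// to runtime helpers that the linked device runtime does not provide.
bool mayChangeToSPMDMode(const ModuleView &M, bool OpenMPPostLink) {
  return runtimeFnsAvailable(
      M, OpenMPPostLink,
      {"__kmpc_get_hardware_thread_id_in_block", "__kmpc_barrier_simple_spmd"});
}

In the patch itself the same check guards SPMD-ization, the custom state-machine rewrite, and the splitting of __tgt_target_data_begin_mapper issue/wait pairs.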