Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
-rw-r--r-- | llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 699
1 file changed, 412 insertions, 287 deletions
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index bee154dab10f..588f3901e3cb 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,8 +22,10 @@ #include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" @@ -36,6 +38,8 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" @@ -44,7 +48,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/InitializePasses.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/IPO/Attributor.h" @@ -188,9 +192,9 @@ struct AAICVTracker; struct OMPInformationCache : public InformationCache { OMPInformationCache(Module &M, AnalysisGetter &AG, BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC, - KernelSet &Kernels) + bool OpenMPPostLink) : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M), - Kernels(Kernels) { + OpenMPPostLink(OpenMPPostLink) { OMPBuilder.initialize(); initializeRuntimeFunctions(M); @@ -417,7 +421,7 @@ struct OMPInformationCache : public InformationCache { // TODO: We directly convert uses into proper calls and unknown uses. for (Use &U : RFI.Declaration->uses()) { if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { - if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) { + if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) { RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); ++NumUses; } @@ -448,6 +452,24 @@ struct OMPInformationCache : public InformationCache { CI->setCallingConv(Fn->getCallingConv()); } + // Helper function to determine if it's legal to create a call to the runtime + // functions. + bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) { + // We can always emit calls if we haven't yet linked in the runtime. + if (!OpenMPPostLink) + return true; + + // Once the runtime has been already been linked in we cannot emit calls to + // any undefined functions. + for (RuntimeFunction Fn : Fns) { + RuntimeFunctionInfo &RFI = RFIs[Fn]; + + if (RFI.Declaration && RFI.Declaration->isDeclaration()) + return false; + } + return true; + } + /// Helper to initialize all runtime function information for those defined /// in OpenMPKinds.def. void initializeRuntimeFunctions(Module &M) { @@ -518,11 +540,11 @@ struct OMPInformationCache : public InformationCache { // TODO: We should attach the attributes defined in OMPKinds.def. } - /// Collection of known kernels (\see Kernel) in the module. - KernelSet &Kernels; - /// Collection of known OpenMP runtime functions.. DenseSet<const Function *> RTLFunctions; + + /// Indicates if we have already linked in the OpenMP device library. 
+ bool OpenMPPostLink = false; }; template <typename Ty, bool InsertInvalidates = true> @@ -808,7 +830,7 @@ struct OpenMPOpt { return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); } - /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. + /// Run all OpenMP optimizations on the underlying SCC. bool run(bool IsModulePass) { if (SCC.empty()) return false; @@ -816,8 +838,7 @@ struct OpenMPOpt { bool Changed = false; LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() - << " functions in a slice with " - << OMPInfoCache.ModuleSlice.size() << " functions\n"); + << " functions\n"); if (IsModulePass) { Changed |= runAttributor(IsModulePass); @@ -882,7 +903,7 @@ struct OpenMPOpt { /// Print OpenMP GPU kernels for testing. void printKernels() const { for (Function *F : SCC) { - if (!OMPInfoCache.Kernels.count(F)) + if (!omp::isKernel(*F)) continue; auto Remark = [&](OptimizationRemarkAnalysis ORA) { @@ -1412,7 +1433,10 @@ private: Changed |= WasSplit; return WasSplit; }; - RFI.foreachUse(SCC, SplitMemTransfers); + if (OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___tgt_target_data_begin_mapper_issue, + OMPRTL___tgt_target_data_begin_mapper_wait})) + RFI.foreachUse(SCC, SplitMemTransfers); return Changed; } @@ -1681,37 +1705,27 @@ private: }; if (!ReplVal) { - for (Use *U : *UV) + auto *DT = + OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F); + if (!DT) + return false; + Instruction *IP = nullptr; + for (Use *U : *UV) { if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { + if (IP) + IP = DT->findNearestCommonDominator(IP, CI); + else + IP = CI; if (!CanBeMoved(*CI)) continue; - - // If the function is a kernel, dedup will move - // the runtime call right after the kernel init callsite. Otherwise, - // it will move it to the beginning of the caller function. - if (isKernel(F)) { - auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; - auto *KernelInitUV = KernelInitRFI.getUseVector(F); - - if (KernelInitUV->empty()) - continue; - - assert(KernelInitUV->size() == 1 && - "Expected a single __kmpc_target_init in kernel\n"); - - CallInst *KernelInitCI = - getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI); - assert(KernelInitCI && - "Expected a call to __kmpc_target_init in kernel\n"); - - CI->moveAfter(KernelInitCI); - } else - CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); - ReplVal = CI; - break; + if (!ReplVal) + ReplVal = CI; } + } if (!ReplVal) return false; + assert(IP && "Expected insertion point!"); + cast<Instruction>(ReplVal)->moveBefore(IP); } // If we use a call as a replacement value we need to make sure the ident is @@ -1809,9 +1823,6 @@ private: /// ///{{ - /// Check if \p F is a kernel, hence entry point for target offloading. - bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } - /// Cache to remember the unique kernel for a function. DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap; @@ -1920,7 +1931,8 @@ public: }; Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { - if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F)) + if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() && + !OMPInfoCache.CGSCC->contains(&F)) return nullptr; // Use a scope to keep the lifetime of the CachedKernel short. 
@@ -2095,12 +2107,6 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { using Base = StateWrapper<BooleanState, AbstractAttribute>; AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {} - void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - if (!F || !A.isFunctionIPOAmendable(*F)) - indicatePessimisticFixpoint(); - } - /// Returns true if value is assumed to be tracked. bool isAssumedTracked() const { return getAssumed(); } @@ -2146,7 +2152,9 @@ struct AAICVTrackerFunction : public AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { return "ICVTrackerFunction"; } + const std::string getAsStr(Attributor *) const override { + return "ICVTrackerFunction"; + } // FIXME: come up with some stats. void trackStatistics() const override {} @@ -2242,11 +2250,12 @@ struct AAICVTrackerFunction : public AAICVTracker { if (CalledFunction->isDeclaration()) return nullptr; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); - if (ICVTrackingAA.isAssumedTracked()) { - std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); + if (ICVTrackingAA->isAssumedTracked()) { + std::optional<Value *> URV = + ICVTrackingAA->getUniqueReplacementValue(ICV); if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I), OMPInfoCache))) return URV; @@ -2337,7 +2346,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "ICVTrackerFunctionReturned"; } @@ -2362,10 +2371,10 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { @@ -2374,7 +2383,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { auto CheckReturnInst = [&](Instruction &I) { std::optional<Value *> NewReplVal = - ICVTrackingAA.getReplacementValue(ICV, &I, A); + ICVTrackingAA->getReplacementValue(ICV, &I, A); // If we found a second ICV value there is no unique returned value. if (UniqueICVValue && UniqueICVValue != NewReplVal) @@ -2407,9 +2416,7 @@ struct AAICVTrackerCallSite : AAICVTracker { : AAICVTracker(IRP, A) {} void initialize(Attributor &A) override { - Function *F = getAnchorScope(); - if (!F || !A.isFunctionIPOAmendable(*F)) - indicatePessimisticFixpoint(); + assert(getAnchorScope() && "Expected anchor function"); // We only initialize this AA for getters, so we need to know which ICV it // gets. @@ -2438,7 +2445,9 @@ struct AAICVTrackerCallSite : AAICVTracker { } // FIXME: come up with better string. - const std::string getAsStr() const override { return "ICVTrackerCallSite"; } + const std::string getAsStr(Attributor *) const override { + return "ICVTrackerCallSite"; + } // FIXME: come up with some stats. 
void trackStatistics() const override {} @@ -2447,15 +2456,15 @@ struct AAICVTrackerCallSite : AAICVTracker { std::optional<Value *> ReplVal; ChangeStatus updateImpl(Attributor &A) override { - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); // We don't have any information, so we assume it changes the ICV. - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); std::optional<Value *> NewReplVal = - ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); + ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A); if (ReplVal == NewReplVal) return ChangeStatus::UNCHANGED; @@ -2477,7 +2486,7 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { : AAICVTracker(IRP, A) {} // FIXME: come up with better string. - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "ICVTrackerCallSiteReturned"; } @@ -2503,18 +2512,18 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker { ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( + const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>( *this, IRPosition::returned(*getAssociatedFunction()), DepClassTy::REQUIRED); // We don't have any information, so we assume it changes the ICV. - if (!ICVTrackingAA.isAssumedTracked()) + if (!ICVTrackingAA->isAssumedTracked()) return indicatePessimisticFixpoint(); for (InternalControlVar ICV : TrackableICVs) { std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; std::optional<Value *> NewReplVal = - ICVTrackingAA.getUniqueReplacementValue(ICV); + ICVTrackingAA->getUniqueReplacementValue(ICV); if (ReplVal == NewReplVal) continue; @@ -2530,26 +2539,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) : AAExecutionDomain(IRP, A) {} - ~AAExecutionDomainFunction() { - delete RPOT; - } + ~AAExecutionDomainFunction() { delete RPOT; } void initialize(Attributor &A) override { - if (getAnchorScope()->isDeclaration()) { - indicatePessimisticFixpoint(); - return; - } - RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope()); + Function *F = getAnchorScope(); + assert(F && "Expected anchor function"); + RPOT = new ReversePostOrderTraversal<Function *>(F); } - const std::string getAsStr() const override { - unsigned TotalBlocks = 0, InitialThreadBlocks = 0; + const std::string getAsStr(Attributor *) const override { + unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0; for (auto &It : BEDMap) { + if (!It.getFirst()) + continue; TotalBlocks++; InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly; + AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly && + It.getSecond().IsReachingAlignedBarrierOnly; } return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" + - std::to_string(TotalBlocks) + " executed by initial thread only"; + std::to_string(AlignedBlocks) + " of " + + std::to_string(TotalBlocks) + + " executed by initial thread / aligned"; } /// See AbstractAttribute::trackStatistics(). 
@@ -2572,7 +2583,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { SmallPtrSet<CallBase *, 16> DeletedBarriers; auto HandleAlignedBarrier = [&](CallBase *CB) { - const ExecutionDomainTy &ED = CEDMap[CB]; + const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr]; if (!ED.IsReachedFromAlignedBarrierOnly || ED.EncounteredNonLocalSideEffect) return; @@ -2596,6 +2607,8 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { CallBase *LastCB = Worklist.pop_back_val(); if (!Visited.insert(LastCB)) continue; + if (LastCB->getFunction() != getAnchorScope()) + continue; if (!DeletedBarriers.count(LastCB)) { A.deleteAfterManifest(*LastCB); continue; @@ -2603,7 +2616,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { // The final aligned barrier (LastCB) reaching the kernel end was // removed already. This means we can go one step further and remove // the barriers encoutered last before (LastCB). - const ExecutionDomainTy &LastED = CEDMap[LastCB]; + const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}]; Worklist.append(LastED.AlignedBarriers.begin(), LastED.AlignedBarriers.end()); } @@ -2619,14 +2632,17 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { for (auto *CB : AlignedBarriers) HandleAlignedBarrier(CB); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); // Handle the "kernel end barrier" for kernels too. - if (OMPInfoCache.Kernels.count(getAnchorScope())) + if (omp::isKernel(*getAnchorScope())) HandleAlignedBarrier(nullptr); return Changed; } + bool isNoOpFence(const FenceInst &FI) const override { + return getState().isValidState() && !NonNoOpFences.count(&FI); + } + /// Merge barrier and assumption information from \p PredED into the successor /// \p ED. void @@ -2636,12 +2652,12 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { /// Merge all information from \p PredED into the successor \p ED. If /// \p InitialEdgeOnly is set, only the initial edge will enter the block /// represented by \p ED from this predecessor. - void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, + bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, bool InitialEdgeOnly = false); /// Accumulate information for the entry block in \p EntryBBED. - void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED); + bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED); /// See AbstractAttribute::updateImpl. ChangeStatus updateImpl(Attributor &A) override; @@ -2651,14 +2667,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { if (!isValidState()) return false; + assert(BB.getParent() == getAnchorScope() && "Block is out of scope!"); return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly; } bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const override { - if (!isValidState() || isa<CallBase>(I)) + assert(I.getFunction() == getAnchorScope() && + "Instruction is out of scope!"); + if (!isValidState()) return false; + bool ForwardIsOk = true; const Instruction *CurI; // Check forward until a call or the block end is reached. 
@@ -2667,15 +2687,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast<CallBase>(CurI); if (!CB) continue; - const auto &It = CEDMap.find(CB); + if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) + return true; + const auto &It = CEDMap.find({CB, PRE}); if (It == CEDMap.end()) continue; - if (!It->getSecond().IsReachedFromAlignedBarrierOnly) - return false; + if (!It->getSecond().IsReachingAlignedBarrierOnly) + ForwardIsOk = false; + break; } while ((CurI = CurI->getNextNonDebugInstruction())); - if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly) - return false; + if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly) + ForwardIsOk = false; // Check backward until a call or the block beginning is reached. CurI = &I; @@ -2683,33 +2706,30 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { auto *CB = dyn_cast<CallBase>(CurI); if (!CB) continue; - const auto &It = CEDMap.find(CB); + if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB))) + return true; + const auto &It = CEDMap.find({CB, POST}); if (It == CEDMap.end()) continue; - if (!AA::isNoSyncInst(A, *CB, *this)) { - if (It->getSecond().IsReachedFromAlignedBarrierOnly) - break; - return false; - } - - Function *Callee = CB->getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - return false; - const auto &EDAA = A.getAAFor<AAExecutionDomain>( - *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); - if (!EDAA.getState().isValidState()) - return false; - if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly) - return false; - break; + if (It->getSecond().IsReachedFromAlignedBarrierOnly) + break; + return false; } while ((CurI = CurI->getPrevNonDebugInstruction())); - if (!CurI && - !llvm::all_of( - predecessors(I.getParent()), [&](const BasicBlock *PredBB) { - return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly; - })) { + // Delayed decision on the forward pass to allow aligned barrier detection + // in the backwards traversal. + if (!ForwardIsOk) return false; + + if (!CurI) { + const BasicBlock *BB = I.getParent(); + if (BB == &BB->getParent()->getEntryBlock()) + return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly; + if (!llvm::all_of(predecessors(BB), [&](const BasicBlock *PredBB) { + return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly; + })) { + return false; + } } // On neither traversal we found a anything but aligned barriers. @@ -2721,15 +2741,16 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { "No request should be made against an invalid state!"); return BEDMap.lookup(&BB); } - ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override { + std::pair<ExecutionDomainTy, ExecutionDomainTy> + getExecutionDomain(const CallBase &CB) const override { assert(isValidState() && "No request should be made against an invalid state!"); - return CEDMap.lookup(&CB); + return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})}; } ExecutionDomainTy getFunctionExecutionDomain() const override { assert(isValidState() && "No request should be made against an invalid state!"); - return BEDMap.lookup(nullptr); + return InterProceduralED; } ///} @@ -2778,12 +2799,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { return false; }; + /// Mapping containing information about the function for other AAs. 
+ ExecutionDomainTy InterProceduralED; + + enum Direction { PRE = 0, POST = 1 }; /// Mapping containing information per block. DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap; - DenseMap<const CallBase *, ExecutionDomainTy> CEDMap; + DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy> + CEDMap; SmallSetVector<CallBase *, 16> AlignedBarriers; ReversePostOrderTraversal<Function *> *RPOT = nullptr; + + /// Set \p R to \V and report true if that changed \p R. + static bool setAndRecord(bool &R, bool V) { + bool Eq = (R == V); + R = V; + return !Eq; + } + + /// Collection of fences known to be non-no-opt. All fences not in this set + /// can be assumed no-opt. + SmallPtrSet<const FenceInst *, 8> NonNoOpFences; }; void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( @@ -2795,62 +2832,82 @@ void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions( ED.addAlignedBarrier(A, *AB); } -void AAExecutionDomainFunction::mergeInPredecessor( +bool AAExecutionDomainFunction::mergeInPredecessor( Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED, bool InitialEdgeOnly) { - ED.IsExecutedByInitialThreadOnly = - InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && - ED.IsExecutedByInitialThreadOnly); - - ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly && - PredED.IsReachedFromAlignedBarrierOnly; - ED.EncounteredNonLocalSideEffect = - ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect; + + bool Changed = false; + Changed |= + setAndRecord(ED.IsExecutedByInitialThreadOnly, + InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly && + ED.IsExecutedByInitialThreadOnly)); + + Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly, + ED.IsReachedFromAlignedBarrierOnly && + PredED.IsReachedFromAlignedBarrierOnly); + Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect, + ED.EncounteredNonLocalSideEffect | + PredED.EncounteredNonLocalSideEffect); + // Do not track assumptions and barriers as part of Changed. 
if (ED.IsReachedFromAlignedBarrierOnly) mergeInPredecessorBarriersAndAssumptions(A, ED, PredED); else ED.clearAssumeInstAndAlignedBarriers(); + return Changed; } -void AAExecutionDomainFunction::handleEntryBB(Attributor &A, +bool AAExecutionDomainFunction::handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED) { - SmallVector<ExecutionDomainTy> PredExecDomains; + SmallVector<std::pair<ExecutionDomainTy, ExecutionDomainTy>, 4> CallSiteEDs; auto PredForCallSite = [&](AbstractCallSite ACS) { - const auto &EDAA = A.getAAFor<AAExecutionDomain>( + const auto *EDAA = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*ACS.getInstruction()->getFunction()), DepClassTy::OPTIONAL); - if (!EDAA.getState().isValidState()) + if (!EDAA || !EDAA->getState().isValidState()) return false; - PredExecDomains.emplace_back( - EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction()))); + CallSiteEDs.emplace_back( + EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction()))); return true; }; + ExecutionDomainTy ExitED; bool AllCallSitesKnown; if (A.checkForAllCallSites(PredForCallSite, *this, /* RequiresAllCallSites */ true, AllCallSitesKnown)) { - for (const auto &PredED : PredExecDomains) - mergeInPredecessor(A, EntryBBED, PredED); + for (const auto &[CSInED, CSOutED] : CallSiteEDs) { + mergeInPredecessor(A, EntryBBED, CSInED); + ExitED.IsReachingAlignedBarrierOnly &= + CSOutED.IsReachingAlignedBarrierOnly; + } } else { // We could not find all predecessors, so this is either a kernel or a // function with external linkage (or with some other weird uses). - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - if (OMPInfoCache.Kernels.count(getAnchorScope())) { + if (omp::isKernel(*getAnchorScope())) { EntryBBED.IsExecutedByInitialThreadOnly = false; EntryBBED.IsReachedFromAlignedBarrierOnly = true; EntryBBED.EncounteredNonLocalSideEffect = false; + ExitED.IsReachingAlignedBarrierOnly = true; } else { EntryBBED.IsExecutedByInitialThreadOnly = false; EntryBBED.IsReachedFromAlignedBarrierOnly = false; EntryBBED.EncounteredNonLocalSideEffect = true; + ExitED.IsReachingAlignedBarrierOnly = false; } } + bool Changed = false; auto &FnED = BEDMap[nullptr]; - FnED.IsReachingAlignedBarrierOnly &= - EntryBBED.IsReachedFromAlignedBarrierOnly; + Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly, + FnED.IsReachedFromAlignedBarrierOnly & + EntryBBED.IsReachedFromAlignedBarrierOnly); + Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly, + FnED.IsReachingAlignedBarrierOnly & + ExitED.IsReachingAlignedBarrierOnly); + Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly, + EntryBBED.IsExecutedByInitialThreadOnly); + return Changed; } ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { @@ -2860,36 +2917,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Helper to deal with an aligned barrier encountered during the forward // traversal. \p CB is the aligned barrier, \p ED is the execution domain when // it was encountered. - auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) { - if (CB) - Changed |= AlignedBarriers.insert(CB); + auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) { + Changed |= AlignedBarriers.insert(&CB); // First, update the barrier ED kept in the separate CEDMap. 
- auto &CallED = CEDMap[CB]; - mergeInPredecessor(A, CallED, ED); + auto &CallInED = CEDMap[{&CB, PRE}]; + Changed |= mergeInPredecessor(A, CallInED, ED); + CallInED.IsReachingAlignedBarrierOnly = true; // Next adjust the ED we use for the traversal. ED.EncounteredNonLocalSideEffect = false; ED.IsReachedFromAlignedBarrierOnly = true; // Aligned barrier collection has to come last. ED.clearAssumeInstAndAlignedBarriers(); - if (CB) - ED.addAlignedBarrier(A, *CB); + ED.addAlignedBarrier(A, CB); + auto &CallOutED = CEDMap[{&CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); }; - auto &LivenessAA = + auto *LivenessAA = A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL); - // Set \p R to \V and report true if that changed \p R. - auto SetAndRecord = [&](bool &R, bool V) { - bool Eq = (R == V); - R = V; - return !Eq; - }; - - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); - Function *F = getAnchorScope(); BasicBlock &EntryBB = F->getEntryBlock(); - bool IsKernel = OMPInfoCache.Kernels.count(F); + bool IsKernel = omp::isKernel(*F); SmallVector<Instruction *> SyncInstWorklist; for (auto &RIt : *RPOT) { @@ -2899,18 +2948,19 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // TODO: We use local reasoning since we don't have a divergence analysis // running as well. We could basically allow uniform branches here. bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel; + bool IsExplicitlyAligned = IsEntryBB && IsKernel; ExecutionDomainTy ED; // Propagate "incoming edges" into information about this block. if (IsEntryBB) { - handleEntryBB(A, ED); + Changed |= handleCallees(A, ED); } else { // For live non-entry blocks we only propagate // information via live edges. - if (LivenessAA.isAssumedDead(&BB)) + if (LivenessAA && LivenessAA->isAssumedDead(&BB)) continue; for (auto *PredBB : predecessors(&BB)) { - if (LivenessAA.isEdgeDead(PredBB, &BB)) + if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB)) continue; bool InitialEdgeOnly = isInitialThreadOnlyEdge( A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB); @@ -2922,7 +2972,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // information to calls. for (Instruction &I : BB) { bool UsedAssumedInformation; - if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation, + if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation, /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL, /* CheckForDeadStore */ true)) continue; @@ -2939,6 +2989,33 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { continue; } + if (auto *FI = dyn_cast<FenceInst>(&I)) { + if (!ED.EncounteredNonLocalSideEffect) { + // An aligned fence without non-local side-effects is a no-op. + if (ED.IsReachedFromAlignedBarrierOnly) + continue; + // A non-aligned fence without non-local side-effects is a no-op + // if the ordering only publishes non-local side-effects (or less). 
+ switch (FI->getOrdering()) { + case AtomicOrdering::NotAtomic: + continue; + case AtomicOrdering::Unordered: + continue; + case AtomicOrdering::Monotonic: + continue; + case AtomicOrdering::Acquire: + break; + case AtomicOrdering::Release: + continue; + case AtomicOrdering::AcquireRelease: + break; + case AtomicOrdering::SequentiallyConsistent: + break; + }; + } + NonNoOpFences.insert(FI); + } + auto *CB = dyn_cast<CallBase>(&I); bool IsNoSync = AA::isNoSyncInst(A, I, *this); bool IsAlignedBarrier = @@ -2946,14 +3023,16 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock); AlignedBarrierLastInBlock &= IsNoSync; + IsExplicitlyAligned &= IsNoSync; // Next we check for calls. Aligned barriers are handled // explicitly, everything else is kept for the backward traversal and will // also affect our state. if (CB) { if (IsAlignedBarrier) { - HandleAlignedBarrier(CB, ED); + HandleAlignedBarrier(*CB, ED); AlignedBarrierLastInBlock = true; + IsExplicitlyAligned = true; continue; } @@ -2971,20 +3050,20 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { // Record how we entered the call, then accumulate the effect of the // call in ED for potential use by the callee. - auto &CallED = CEDMap[CB]; - mergeInPredecessor(A, CallED, ED); + auto &CallInED = CEDMap[{CB, PRE}]; + Changed |= mergeInPredecessor(A, CallInED, ED); // If we have a sync-definition we can check if it starts/ends in an // aligned barrier. If we are unsure we assume any sync breaks // alignment. Function *Callee = CB->getCalledFunction(); if (!IsNoSync && Callee && !Callee->isDeclaration()) { - const auto &EDAA = A.getAAFor<AAExecutionDomain>( + const auto *EDAA = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL); - if (EDAA.getState().isValidState()) { - const auto &CalleeED = EDAA.getFunctionExecutionDomain(); + if (EDAA && EDAA->getState().isValidState()) { + const auto &CalleeED = EDAA->getFunctionExecutionDomain(); ED.IsReachedFromAlignedBarrierOnly = - CalleeED.IsReachedFromAlignedBarrierOnly; + CalleeED.IsReachedFromAlignedBarrierOnly; AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly; if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly) ED.EncounteredNonLocalSideEffect |= @@ -2992,19 +3071,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { else ED.EncounteredNonLocalSideEffect = CalleeED.EncounteredNonLocalSideEffect; - if (!CalleeED.IsReachingAlignedBarrierOnly) + if (!CalleeED.IsReachingAlignedBarrierOnly) { + Changed |= + setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); SyncInstWorklist.push_back(&I); + } if (CalleeED.IsReachedFromAlignedBarrierOnly) mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED); + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); continue; } } - ED.IsReachedFromAlignedBarrierOnly = - IsNoSync && ED.IsReachedFromAlignedBarrierOnly; + if (!IsNoSync) { + ED.IsReachedFromAlignedBarrierOnly = false; + Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); + SyncInstWorklist.push_back(&I); + } AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly; ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory(); - if (!IsNoSync) - SyncInstWorklist.push_back(&I); + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= mergeInPredecessor(A, CallOutED, ED); } if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) @@ -3013,7 +3100,7 @@ ChangeStatus 
AAExecutionDomainFunction::updateImpl(Attributor &A) { // If we have a callee we try to use fine-grained information to // determine local side-effects. if (CB) { - const auto &MemAA = A.getAAFor<AAMemoryLocation>( + const auto *MemAA = A.getAAFor<AAMemoryLocation>( *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL); auto AccessPred = [&](const Instruction *I, const Value *Ptr, @@ -3021,13 +3108,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { AAMemoryLocation::MemoryLocationsKind) { return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I); }; - if (MemAA.getState().isValidState() && - MemAA.checkForAllAccessesToMemoryKind( + if (MemAA && MemAA->getState().isValidState() && + MemAA->checkForAllAccessesToMemoryKind( AccessPred, AAMemoryLocation::ALL_LOCATIONS)) continue; } - if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I)) + auto &InfoCache = A.getInfoCache(); + if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I)) continue; if (auto *LI = dyn_cast<LoadInst>(&I)) @@ -3039,18 +3127,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { ED.EncounteredNonLocalSideEffect = true; } + bool IsEndAndNotReachingAlignedBarriersOnly = false; if (!isa<UnreachableInst>(BB.getTerminator()) && !BB.getTerminator()->getNumSuccessors()) { - auto &FnED = BEDMap[nullptr]; - mergeInPredecessor(A, FnED, ED); + Changed |= mergeInPredecessor(A, InterProceduralED, ED); - if (IsKernel) - HandleAlignedBarrier(nullptr, ED); + auto &FnED = BEDMap[nullptr]; + if (IsKernel && !IsExplicitlyAligned) + FnED.IsReachingAlignedBarrierOnly = false; + Changed |= mergeInPredecessor(A, FnED, ED); + + if (!FnED.IsReachingAlignedBarrierOnly) { + IsEndAndNotReachingAlignedBarriersOnly = true; + SyncInstWorklist.push_back(BB.getTerminator()); + auto &BBED = BEDMap[&BB]; + Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false); + } } ExecutionDomainTy &StoredED = BEDMap[&BB]; - ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly; + ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly & + !IsEndAndNotReachingAlignedBarriersOnly; // Check if we computed anything different as part of the forward // traversal. 
We do not take assumptions and aligned barriers into account @@ -3074,36 +3172,38 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { while (!SyncInstWorklist.empty()) { Instruction *SyncInst = SyncInstWorklist.pop_back_val(); Instruction *CurInst = SyncInst; - bool HitAlignedBarrier = false; + bool HitAlignedBarrierOrKnownEnd = false; while ((CurInst = CurInst->getPrevNode())) { auto *CB = dyn_cast<CallBase>(CurInst); if (!CB) continue; - auto &CallED = CEDMap[CB]; - if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false)) - Changed = true; - HitAlignedBarrier = AlignedBarriers.count(CB); - if (HitAlignedBarrier) + auto &CallOutED = CEDMap[{CB, POST}]; + Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false); + auto &CallInED = CEDMap[{CB, PRE}]; + HitAlignedBarrierOrKnownEnd = + AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly; + if (HitAlignedBarrierOrKnownEnd) break; + Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false); } - if (HitAlignedBarrier) + if (HitAlignedBarrierOrKnownEnd) continue; BasicBlock *SyncBB = SyncInst->getParent(); for (auto *PredBB : predecessors(SyncBB)) { - if (LivenessAA.isEdgeDead(PredBB, SyncBB)) + if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB)) continue; if (!Visited.insert(PredBB)) continue; - SyncInstWorklist.push_back(PredBB->getTerminator()); auto &PredED = BEDMap[PredBB]; - if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) + if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) { Changed = true; + SyncInstWorklist.push_back(PredBB->getTerminator()); + } } if (SyncBB != &EntryBB) continue; - auto &FnED = BEDMap[nullptr]; - if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false)) - Changed = true; + Changed |= + setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false); } return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; @@ -3146,7 +3246,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A) : AAHeapToShared(IRP, A) {} - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) + " malloc calls eligible."; } @@ -3261,7 +3361,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); auto *SharedMem = new GlobalVariable( *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, - UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, + PoisonValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr, GlobalValue::NotThreadLocal, static_cast<unsigned>(AddressSpace::Shared)); auto *NewBuffer = @@ -3270,7 +3370,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto Remark = [&](OptimizationRemark OR) { return OR << "Replaced globalized variable with " << ore::NV("SharedMemory", AllocSize->getZExtValue()) - << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ") + << (AllocSize->isOne() ? 
" byte " : " bytes ") << "of shared memory."; }; A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark); @@ -3278,7 +3378,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared { MaybeAlign Alignment = CB->getRetAlign(); assert(Alignment && "HeapToShared on allocation without alignment attribute"); - SharedMem->setAlignment(MaybeAlign(Alignment)); + SharedMem->setAlignment(*Alignment); A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer); A.deleteAfterManifest(*CB); @@ -3315,9 +3415,9 @@ struct AAHeapToSharedFunction : public AAHeapToShared { MallocCalls.remove(CB); continue; } - const auto &ED = A.getAAFor<AAExecutionDomain>( + const auto *ED = A.getAAFor<AAExecutionDomain>( *this, IRPosition::function(*F), DepClassTy::REQUIRED); - if (!ED.isExecutedByInitialThreadOnly(*CB)) + if (!ED || !ED->isExecutedByInitialThreadOnly(*CB)) MallocCalls.remove(CB); } } @@ -3346,7 +3446,7 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { void trackStatistics() const override {} /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { if (!isValidState()) return "<invalid>"; return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD" @@ -3456,22 +3556,7 @@ struct AAKernelInfoFunction : AAKernelInfo { Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> std::optional<Value *> { - // IRP represents the "use generic state machine" argument of an - // __kmpc_target_init call. We will answer this one with the internal - // state. As long as we are not in an invalid state, we will create a - // custom state machine so the value should be a `i1 false`. If we are - // in an invalid state, we won't change the value that is in the IR. - if (!ReachedKnownParallelRegions.isValidState()) - return nullptr; - // If we have disabled state machine rewrites, don't make a custom one. - if (DisableOpenMPOptStateMachineRewrite) return nullptr; - if (AA) - A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); - UsedAssumedInformation = !isAtFixpoint(); - auto *FalseVal = - ConstantInt::getBool(IRP.getAnchorValue().getContext(), false); - return FalseVal; }; Attributor::SimplifictionCallbackTy ModeSimplifyCB = @@ -3622,10 +3707,11 @@ struct AAKernelInfoFunction : AAKernelInfo { Function *Kernel = getAnchorScope(); Module &M = *Kernel->getParent(); Type *Int8Ty = Type::getInt8Ty(M.getContext()); - new GlobalVariable(M, Int8Ty, /* isConstant */ true, - GlobalValue::WeakAnyLinkage, - ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), - Kernel->getName() + "_nested_parallelism"); + auto *GV = new GlobalVariable( + M, Int8Ty, /* isConstant */ true, GlobalValue::WeakAnyLinkage, + ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0), + Kernel->getName() + "_nested_parallelism"); + GV->setVisibility(GlobalValue::HiddenVisibility); // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. @@ -3914,6 +4000,12 @@ struct AAKernelInfoFunction : AAKernelInfo { bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + // We cannot change to SPMD mode if the runtime functions aren't availible. 
+ if (!OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___kmpc_get_hardware_thread_id_in_block, + OMPRTL___kmpc_barrier_simple_spmd})) + return false; + if (!SPMDCompatibilityTracker.isAssumed()) { for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) { if (!NonCompatibleI) @@ -3951,7 +4043,7 @@ struct AAKernelInfoFunction : AAKernelInfo { auto *CB = cast<CallBase>(Kernel->user_back()); Kernel = CB->getCaller(); } - assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); + assert(omp::isKernel(*Kernel) && "Expected kernel function!"); // Check if the kernel is already in SPMD mode, if so, return success. GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( @@ -4021,6 +4113,13 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!ReachedKnownParallelRegions.isValidState()) return ChangeStatus::UNCHANGED; + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); + if (!OMPInfoCache.runtimeFnsAvailable( + {OMPRTL___kmpc_get_hardware_num_threads_in_block, + OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic, + OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel})) + return ChangeStatus::UNCHANGED; + const int InitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; @@ -4167,7 +4266,6 @@ struct AAKernelInfoFunction : AAKernelInfo { BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); Module &M = *Kernel->getParent(); - auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); FunctionCallee BlockHwSizeFn = OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_hardware_num_threads_in_block); @@ -4220,10 +4318,7 @@ struct AAKernelInfoFunction : AAKernelInfo { if (WorkFnAI->getType()->getPointerAddressSpace() != (unsigned int)AddressSpace::Generic) { WorkFnAI = new AddrSpaceCastInst( - WorkFnAI, - PointerType::getWithSamePointeeType( - cast<PointerType>(WorkFnAI->getType()), - (unsigned int)AddressSpace::Generic), + WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic), WorkFnAI->getName() + ".generic", StateMachineBeginBB); WorkFnAI->setDebugLoc(DLoc); } @@ -4345,19 +4440,20 @@ struct AAKernelInfoFunction : AAKernelInfo { if (!I.mayWriteToMemory()) return true; if (auto *SI = dyn_cast<StoreInst>(&I)) { - const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>( + const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>( *this, IRPosition::value(*SI->getPointerOperand()), DepClassTy::OPTIONAL); - auto &HS = A.getAAFor<AAHeapToStack>( + auto *HS = A.getAAFor<AAHeapToStack>( *this, IRPosition::function(*I.getFunction()), DepClassTy::OPTIONAL); - if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) { + if (UnderlyingObjsAA && + UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) { if (AA::isAssumedThreadLocalObject(A, Obj, *this)) return true; // Check for AAHeapToStack moved objects which must not be // guarded. auto *CB = dyn_cast<CallBase>(&Obj); - return CB && HS.isAssumedHeapToStack(*CB); + return CB && HS && HS->isAssumedHeapToStack(*CB); })) return true; } @@ -4392,14 +4488,14 @@ struct AAKernelInfoFunction : AAKernelInfo { // we cannot fix the internal spmd-zation state either. 
int SPMD = 0, Generic = 0; for (auto *Kernel : ReachingKernelEntries) { - auto &CBAA = A.getAAFor<AAKernelInfo>( + auto *CBAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL); - if (CBAA.SPMDCompatibilityTracker.isValidState() && - CBAA.SPMDCompatibilityTracker.isAssumed()) + if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() && + CBAA->SPMDCompatibilityTracker.isAssumed()) ++SPMD; else ++Generic; - if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint()) + if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint()) UsedAssumedInformationFromReachingKernels = true; } if (SPMD != 0 && Generic != 0) @@ -4413,14 +4509,16 @@ struct AAKernelInfoFunction : AAKernelInfo { bool AllSPMDStatesWereFixed = true; auto CheckCallInst = [&](Instruction &I) { auto &CB = cast<CallBase>(I); - auto &CBAA = A.getAAFor<AAKernelInfo>( + auto *CBAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); - getState() ^= CBAA.getState(); - AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); + if (!CBAA) + return false; + getState() ^= CBAA->getState(); + AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint(); AllParallelRegionStatesWereFixed &= - CBAA.ReachedKnownParallelRegions.isAtFixpoint(); + CBAA->ReachedKnownParallelRegions.isAtFixpoint(); AllParallelRegionStatesWereFixed &= - CBAA.ReachedUnknownParallelRegions.isAtFixpoint(); + CBAA->ReachedUnknownParallelRegions.isAtFixpoint(); return true; }; @@ -4460,10 +4558,10 @@ private: assert(Caller && "Caller is nullptr"); - auto &CAA = A.getOrCreateAAFor<AAKernelInfo>( + auto *CAA = A.getOrCreateAAFor<AAKernelInfo>( IRPosition::function(*Caller), this, DepClassTy::REQUIRED); - if (CAA.ReachingKernelEntries.isValidState()) { - ReachingKernelEntries ^= CAA.ReachingKernelEntries; + if (CAA && CAA->ReachingKernelEntries.isValidState()) { + ReachingKernelEntries ^= CAA->ReachingKernelEntries; return true; } @@ -4491,9 +4589,9 @@ private: assert(Caller && "Caller is nullptr"); - auto &CAA = + auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller)); - if (CAA.ParallelLevels.isValidState()) { + if (CAA && CAA->ParallelLevels.isValidState()) { // Any function that is called by `__kmpc_parallel_51` will not be // folded as the parallel level in the function is updated. In order to // get it right, all the analysis would depend on the implentation. That @@ -4504,7 +4602,7 @@ private: return true; } - ParallelLevels ^= CAA.ParallelLevels; + ParallelLevels ^= CAA->ParallelLevels; return true; } @@ -4538,11 +4636,11 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); Function *Callee = getAssociatedFunction(); - auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>( + auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>( *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); // Check for SPMD-mode assumptions. - if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) { + if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) { SPMDCompatibilityTracker.indicateOptimisticFixpoint(); indicateOptimisticFixpoint(); } @@ -4567,8 +4665,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { // Unknown callees might contain parallel regions, except if they have // an appropriate assumption attached. 
- if (!(AssumptionAA.hasAssumption("omp_no_openmp") || - AssumptionAA.hasAssumption("omp_no_parallelism"))) + if (!AssumptionAA || + !(AssumptionAA->hasAssumption("omp_no_openmp") || + AssumptionAA->hasAssumption("omp_no_parallelism"))) ReachedUnknownParallelRegions.insert(&CB); // If SPMDCompatibilityTracker is not fixed, we need to give up on the @@ -4643,11 +4742,11 @@ struct AAKernelInfoCallSite : AAKernelInfo { CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) { ReachedKnownParallelRegions.insert(ParallelRegion); /// Check nested parallelism - auto &FnAA = A.getAAFor<AAKernelInfo>( + auto *FnAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL); - NestedParallelism |= !FnAA.getState().isValidState() || - !FnAA.ReachedKnownParallelRegions.empty() || - !FnAA.ReachedUnknownParallelRegions.empty(); + NestedParallelism |= !FnAA || !FnAA->getState().isValidState() || + !FnAA->ReachedKnownParallelRegions.empty() || + !FnAA->ReachedUnknownParallelRegions.empty(); break; } // The condition above should usually get the parallel region function @@ -4691,10 +4790,12 @@ struct AAKernelInfoCallSite : AAKernelInfo { // If F is not a runtime function, propagate the AAKernelInfo of the callee. if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { const IRPosition &FnPos = IRPosition::function(*F); - auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED); - if (getState() == FnAA.getState()) + auto *FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED); + if (!FnAA) + return indicatePessimisticFixpoint(); + if (getState() == FnAA->getState()) return ChangeStatus::UNCHANGED; - getState() = FnAA.getState(); + getState() = FnAA->getState(); return ChangeStatus::CHANGED; } @@ -4707,9 +4808,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { CallBase &CB = cast<CallBase>(getAssociatedValue()); - auto &HeapToStackAA = A.getAAFor<AAHeapToStack>( + auto *HeapToStackAA = A.getAAFor<AAHeapToStack>( *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); - auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>( + auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>( *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); RuntimeFunction RF = It->getSecond(); @@ -4718,13 +4819,15 @@ struct AAKernelInfoCallSite : AAKernelInfo { // If neither HeapToStack nor HeapToShared assume the call is removed, // assume SPMD incompatibility. 
case OMPRTL___kmpc_alloc_shared: - if (!HeapToStackAA.isAssumedHeapToStack(CB) && - !HeapToSharedAA.isAssumedHeapToShared(CB)) + if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) && + (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB))) SPMDCompatibilityTracker.insert(&CB); break; case OMPRTL___kmpc_free_shared: - if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) && - !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB)) + if ((!HeapToStackAA || + !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) && + (!HeapToSharedAA || + !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB))) SPMDCompatibilityTracker.insert(&CB); break; default: @@ -4770,7 +4873,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { : AAFoldRuntimeCall(IRP, A) {} /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { if (!isValidState()) return "<invalid>"; @@ -4883,28 +4986,29 @@ private: unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { - auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { + auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), DepClassTy::REQUIRED); - if (!AA.isValidState()) { + if (!AA || !AA->isValidState()) { SimplifiedValue = nullptr; return indicatePessimisticFixpoint(); } - if (AA.SPMDCompatibilityTracker.isAssumed()) { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAssumed()) { + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownSPMDCount; else ++AssumedSPMDCount; } else { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownNonSPMDCount; else ++AssumedNonSPMDCount; @@ -4943,16 +5047,17 @@ private: ChangeStatus foldParallelLevel(Attributor &A) { std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ParallelLevels.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ParallelLevels.isValidState()) return indicatePessimisticFixpoint(); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); - if (CallerKernelInfoAA.ReachingKernelEntries.empty()) { + if (CallerKernelInfoAA->ReachingKernelEntries.empty()) { assert(!SimplifiedValue && "SimplifiedValue should keep none at this point"); return ChangeStatus::UNCHANGED; @@ -4960,19 +5065,19 @@ private: unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { - auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { + 
auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), DepClassTy::REQUIRED); - if (!AA.SPMDCompatibilityTracker.isValidState()) + if (!AA || !AA->SPMDCompatibilityTracker.isValidState()) return indicatePessimisticFixpoint(); - if (AA.SPMDCompatibilityTracker.isAssumed()) { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAssumed()) { + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownSPMDCount; else ++AssumedSPMDCount; } else { - if (AA.SPMDCompatibilityTracker.isAtFixpoint()) + if (AA->SPMDCompatibilityTracker.isAtFixpoint()) ++KnownNonSPMDCount; else ++AssumedNonSPMDCount; @@ -5005,14 +5110,15 @@ private: int32_t CurrentAttrValue = -1; std::optional<Value *> SimplifiedValueBefore = SimplifiedValue; - auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( + auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); - if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) + if (!CallerKernelInfoAA || + !CallerKernelInfoAA->ReachingKernelEntries.isValidState()) return indicatePessimisticFixpoint(); // Iterate over the kernels that reach this function - for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { + for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) { int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1); if (NextAttrVal == -1 || @@ -5135,6 +5241,8 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) { A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F)); if (!DisableOpenMPOptDeglobalization) A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F)); + if (F.hasFnAttribute(Attribute::Convergent)) + A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F)); for (auto &I : instructions(F)) { if (auto *LI = dyn_cast<LoadInst>(&I)) { @@ -5147,6 +5255,10 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) { A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI)); continue; } + if (auto *FI = dyn_cast<FenceInst>(&I)) { + A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI)); + continue; + } if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::assume) { A.getOrCreateAAFor<AAPotentialValues>( @@ -5304,6 +5416,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { }); }; + bool Changed = false; + // Create internal copies of each function if this is a kernel Module. This // allows iterprocedural passes to see every call edge. DenseMap<Function *, Function *> InternalizedMap; @@ -5319,7 +5433,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { } } - Attributor::internalizeFunctions(InternalizeFns, InternalizedMap); + Changed |= + Attributor::internalizeFunctions(InternalizeFns, InternalizedMap); } // Look at every function in the Module unless it was internalized. @@ -5332,7 +5447,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { } if (SCC.empty()) - return PreservedAnalyses::all(); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); AnalysisGetter AG(FAM); @@ -5343,7 +5458,9 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { BumpPtrAllocator Allocator; CallGraphUpdater CGUpdater; - OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels); + bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || + LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink; + OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, PostLink); unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; @@ -5356,11 +5473,14 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { AC.OREGetter = OREGetter; AC.PassName = DEBUG_TYPE; AC.InitializationCallback = OpenMPOpt::registerAAsForFunction; + AC.IPOAmendableCB = [](const Function &F) { + return F.hasFnAttribute("kernel"); + }; Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); - bool Changed = OMPOpt.run(true); + Changed |= OMPOpt.run(true); // Optionally inline device functions for potentially better performance. if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M)) @@ -5417,9 +5537,11 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, CallGraphUpdater CGUpdater; CGUpdater.initialize(CG, C, AM, UR); + bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || + LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink; SetVector<Function *> Functions(SCC.begin(), SCC.end()); OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, - /*CGSCC*/ &Functions, Kernels); + /*CGSCC*/ &Functions, PostLink); unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; @@ -5447,6 +5569,8 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); } +bool llvm::omp::isKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); } + KernelSet llvm::omp::getDeviceKernels(Module &M) { // TODO: Create a more cross-platform way of determining device kernels. NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); @@ -5467,6 +5591,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) { if (!KernelFn) continue; + assert(isKernel(*KernelFn) && "Inconsistent kernel function annotation"); ++NumOpenMPTargetRegionKernels; Kernels.insert(KernelFn); |
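
A recurring pattern in this patch is the post-link gate: once the OpenMP device runtime has been linked in (OpenMPPostLink), the pass only introduces calls to runtime functions that already have definitions, via OMPInformationCache::runtimeFnsAvailable(). Below is a minimal, self-contained sketch of that idea in plain C++; it is not LLVM code, and ModuleView, IsDeclarationOnly, and mayChangeToSPMDMode are illustrative stand-ins for the real OMPInformationCache/RuntimeFunctionInfo machinery and the OMPRTL_* enumerators.

#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for the module's view of runtime functions:
// a function is "declaration only" if it has no body in this module.
struct ModuleView {
  std::unordered_map<std::string, bool> IsDeclarationOnly;
};

// Mirrors the shape of OMPInformationCache::runtimeFnsAvailable(): before the
// device runtime is linked in, any call may be emitted; afterwards, calls are
// only legal if every required function already has a definition.
bool runtimeFnsAvailable(const ModuleView &M, bool OpenMPPostLink,
                         const std::vector<std::string> &Fns) {
  if (!OpenMPPostLink)
    return true;
  for (const std::string &Fn : Fns) {
    auto It = M.IsDeclarationOnly.find(Fn);
    if (It != M.IsDeclarationOnly.end() && It->second)
      return false;
  }
  return true;
}

// Usage mirroring changeToSPMDMode(): bail out instead of introducing calls
// to runtime helpers that the linked device runtime does not provide.
bool mayChangeToSPMDMode(const ModuleView &M, bool OpenMPPostLink) {
  return runtimeFnsAvailable(
      M, OpenMPPostLink,
      {"__kmpc_get_hardware_thread_id_in_block", "__kmpc_barrier_simple_spmd"});
}

In the patch itself the same check guards SPMD-ization, the custom state-machine rewrite, and the splitting of __tgt_target_data_begin_mapper issue/wait pairs.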