Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 699
1 file changed, 412 insertions(+), 287 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index bee154dab10f..588f3901e3cb 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,8 +22,10 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
@@ -36,6 +38,8 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
@@ -44,7 +48,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO/Attributor.h"
@@ -188,9 +192,9 @@ struct AAICVTracker;
struct OMPInformationCache : public InformationCache {
OMPInformationCache(Module &M, AnalysisGetter &AG,
BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
- KernelSet &Kernels)
+ bool OpenMPPostLink)
: InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
- Kernels(Kernels) {
+ OpenMPPostLink(OpenMPPostLink) {
OMPBuilder.initialize();
initializeRuntimeFunctions(M);
@@ -417,7 +421,7 @@ struct OMPInformationCache : public InformationCache {
// TODO: We directly convert uses into proper calls and unknown uses.
for (Use &U : RFI.Declaration->uses()) {
if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
- if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) {
+ if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
++NumUses;
}
@@ -448,6 +452,24 @@ struct OMPInformationCache : public InformationCache {
CI->setCallingConv(Fn->getCallingConv());
}
+ // Helper function to determine if it's legal to create a call to the runtime
+ // functions.
+ bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {
+ // We can always emit calls if we haven't yet linked in the runtime.
+ if (!OpenMPPostLink)
+ return true;
+
+ // Once the runtime has already been linked in we cannot emit calls to
+ // any undefined functions.
+ for (RuntimeFunction Fn : Fns) {
+ RuntimeFunctionInfo &RFI = RFIs[Fn];
+
+ if (RFI.Declaration && RFI.Declaration->isDeclaration())
+ return false;
+ }
+ return true;
+ }
+
/// Helper to initialize all runtime function information for those defined
/// in OpenMPKinds.def.
void initializeRuntimeFunctions(Module &M) {
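
The new runtimeFnsAvailable check is the gate the transformations below use before they emit calls into the device runtime: pre-link everything is assumed available, post-link only functions that already have a definition are. A minimal sketch of the pattern as it would look inside OpenMPOpt.cpp (the helper name is illustrative, not part of the patch):

    // Bail out of a rewrite when the runtime entry points it needs can no
    // longer be materialized (post-link with missing definitions).
    static bool canUseSPMDHelpers(OMPInformationCache &OMPInfoCache) {
      return OMPInfoCache.runtimeFnsAvailable(
          {OMPRTL___kmpc_get_hardware_thread_id_in_block,
           OMPRTL___kmpc_barrier_simple_spmd});
    }
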
@@ -518,11 +540,11 @@ struct OMPInformationCache : public InformationCache {
// TODO: We should attach the attributes defined in OMPKinds.def.
}
- /// Collection of known kernels (\see Kernel) in the module.
- KernelSet &Kernels;
-
/// Collection of known OpenMP runtime functions.
DenseSet<const Function *> RTLFunctions;
+
+ /// Indicates if we have already linked in the OpenMP device library.
+ bool OpenMPPostLink = false;
};
template <typename Ty, bool InsertInvalidates = true>
@@ -808,7 +830,7 @@ struct OpenMPOpt {
return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
}
- /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
+ /// Run all OpenMP optimizations on the underlying SCC.
bool run(bool IsModulePass) {
if (SCC.empty())
return false;
@@ -816,8 +838,7 @@ struct OpenMPOpt {
bool Changed = false;
LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
- << " functions in a slice with "
- << OMPInfoCache.ModuleSlice.size() << " functions\n");
+ << " functions\n");
if (IsModulePass) {
Changed |= runAttributor(IsModulePass);
@@ -882,7 +903,7 @@ struct OpenMPOpt {
/// Print OpenMP GPU kernels for testing.
void printKernels() const {
for (Function *F : SCC) {
- if (!OMPInfoCache.Kernels.count(F))
+ if (!omp::isKernel(*F))
continue;
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
@@ -1412,7 +1433,10 @@ private:
Changed |= WasSplit;
return WasSplit;
};
- RFI.foreachUse(SCC, SplitMemTransfers);
+ if (OMPInfoCache.runtimeFnsAvailable(
+ {OMPRTL___tgt_target_data_begin_mapper_issue,
+ OMPRTL___tgt_target_data_begin_mapper_wait}))
+ RFI.foreachUse(SCC, SplitMemTransfers);
return Changed;
}
@@ -1681,37 +1705,27 @@ private:
};
if (!ReplVal) {
- for (Use *U : *UV)
+ auto *DT =
+ OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
+ if (!DT)
+ return false;
+ Instruction *IP = nullptr;
+ for (Use *U : *UV) {
if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
+ if (IP)
+ IP = DT->findNearestCommonDominator(IP, CI);
+ else
+ IP = CI;
if (!CanBeMoved(*CI))
continue;
-
- // If the function is a kernel, dedup will move
- // the runtime call right after the kernel init callsite. Otherwise,
- // it will move it to the beginning of the caller function.
- if (isKernel(F)) {
- auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
- auto *KernelInitUV = KernelInitRFI.getUseVector(F);
-
- if (KernelInitUV->empty())
- continue;
-
- assert(KernelInitUV->size() == 1 &&
- "Expected a single __kmpc_target_init in kernel\n");
-
- CallInst *KernelInitCI =
- getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI);
- assert(KernelInitCI &&
- "Expected a call to __kmpc_target_init in kernel\n");
-
- CI->moveAfter(KernelInitCI);
- } else
- CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
- ReplVal = CI;
- break;
+ if (!ReplVal)
+ ReplVal = CI;
}
+ }
if (!ReplVal)
return false;
+ assert(IP && "Expected insertion point!");
+ cast<Instruction>(ReplVal)->moveBefore(IP);
}
// If we use a call as a replacement value we need to make sure the ident is
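
The deduplication above now derives its insertion point from the dominator tree instead of special-casing kernels: the surviving call is moved to the nearest common dominator of all duplicate call sites. A compressed sketch of that computation (helper name and signature are illustrative):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Instructions.h"

    // Fold all duplicate call sites into a single point that dominates them;
    // the kept call is then moved right before the returned instruction.
    static llvm::Instruction *
    findDedupInsertionPoint(llvm::DominatorTree &DT,
                            llvm::ArrayRef<llvm::CallInst *> Calls) {
      llvm::Instruction *IP = nullptr;
      for (llvm::CallInst *CI : Calls)
        IP = IP ? DT.findNearestCommonDominator(IP, CI) : CI;
      return IP;
    }
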
@@ -1809,9 +1823,6 @@ private:
///
///{{
- /// Check if \p F is a kernel, hence entry point for target offloading.
- bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
-
/// Cache to remember the unique kernel for a function.
DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
@@ -1920,7 +1931,8 @@ public:
};
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
- if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F))
+ if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
+ !OMPInfoCache.CGSCC->contains(&F))
return nullptr;
// Use a scope to keep the lifetime of the CachedKernel short.
@@ -2095,12 +2107,6 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
using Base = StateWrapper<BooleanState, AbstractAttribute>;
AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- void initialize(Attributor &A) override {
- Function *F = getAnchorScope();
- if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
-
/// Returns true if value is assumed to be tracked.
bool isAssumedTracked() const { return getAssumed(); }
@@ -2146,7 +2152,9 @@ struct AAICVTrackerFunction : public AAICVTracker {
: AAICVTracker(IRP, A) {}
// FIXME: come up with better string.
- const std::string getAsStr() const override { return "ICVTrackerFunction"; }
+ const std::string getAsStr(Attributor *) const override {
+ return "ICVTrackerFunction";
+ }
// FIXME: come up with some stats.
void trackStatistics() const override {}
@@ -2242,11 +2250,12 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction->isDeclaration())
return nullptr;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
+ const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
- if (ICVTrackingAA.isAssumedTracked()) {
- std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
+ if (ICVTrackingAA->isAssumedTracked()) {
+ std::optional<Value *> URV =
+ ICVTrackingAA->getUniqueReplacementValue(ICV);
if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I),
OMPInfoCache)))
return URV;
@@ -2337,7 +2346,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
: AAICVTracker(IRP, A) {}
// FIXME: come up with better string.
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
return "ICVTrackerFunctionReturned";
}
@@ -2362,10 +2371,10 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
+ const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!ICVTrackingAA.isAssumedTracked())
+ if (!ICVTrackingAA->isAssumedTracked())
return indicatePessimisticFixpoint();
for (InternalControlVar ICV : TrackableICVs) {
@@ -2374,7 +2383,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
auto CheckReturnInst = [&](Instruction &I) {
std::optional<Value *> NewReplVal =
- ICVTrackingAA.getReplacementValue(ICV, &I, A);
+ ICVTrackingAA->getReplacementValue(ICV, &I, A);
// If we found a second ICV value there is no unique returned value.
if (UniqueICVValue && UniqueICVValue != NewReplVal)
@@ -2407,9 +2416,7 @@ struct AAICVTrackerCallSite : AAICVTracker {
: AAICVTracker(IRP, A) {}
void initialize(Attributor &A) override {
- Function *F = getAnchorScope();
- if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
+ assert(getAnchorScope() && "Expected anchor function");
// We only initialize this AA for getters, so we need to know which ICV it
// gets.
@@ -2438,7 +2445,9 @@ struct AAICVTrackerCallSite : AAICVTracker {
}
// FIXME: come up with better string.
- const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
+ const std::string getAsStr(Attributor *) const override {
+ return "ICVTrackerCallSite";
+ }
// FIXME: come up with some stats.
void trackStatistics() const override {}
@@ -2447,15 +2456,15 @@ struct AAICVTrackerCallSite : AAICVTracker {
std::optional<Value *> ReplVal;
ChangeStatus updateImpl(Attributor &A) override {
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
+ const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
// We don't have any information, so we assume it changes the ICV.
- if (!ICVTrackingAA.isAssumedTracked())
+ if (!ICVTrackingAA->isAssumedTracked())
return indicatePessimisticFixpoint();
std::optional<Value *> NewReplVal =
- ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
+ ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);
if (ReplVal == NewReplVal)
return ChangeStatus::UNCHANGED;
@@ -2477,7 +2486,7 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
: AAICVTracker(IRP, A) {}
// FIXME: come up with better string.
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
return "ICVTrackerCallSiteReturned";
}
@@ -2503,18 +2512,18 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
- const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
+ const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::returned(*getAssociatedFunction()),
DepClassTy::REQUIRED);
// We don't have any information, so we assume it changes the ICV.
- if (!ICVTrackingAA.isAssumedTracked())
+ if (!ICVTrackingAA->isAssumedTracked())
return indicatePessimisticFixpoint();
for (InternalControlVar ICV : TrackableICVs) {
std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
std::optional<Value *> NewReplVal =
- ICVTrackingAA.getUniqueReplacementValue(ICV);
+ ICVTrackingAA->getUniqueReplacementValue(ICV);
if (ReplVal == NewReplVal)
continue;
@@ -2530,26 +2539,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
: AAExecutionDomain(IRP, A) {}
- ~AAExecutionDomainFunction() {
- delete RPOT;
- }
+ ~AAExecutionDomainFunction() { delete RPOT; }
void initialize(Attributor &A) override {
- if (getAnchorScope()->isDeclaration()) {
- indicatePessimisticFixpoint();
- return;
- }
- RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope());
+ Function *F = getAnchorScope();
+ assert(F && "Expected anchor function");
+ RPOT = new ReversePostOrderTraversal<Function *>(F);
}
- const std::string getAsStr() const override {
- unsigned TotalBlocks = 0, InitialThreadBlocks = 0;
+ const std::string getAsStr(Attributor *) const override {
+ unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
for (auto &It : BEDMap) {
+ if (!It.getFirst())
+ continue;
TotalBlocks++;
InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
+ AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
+ It.getSecond().IsReachingAlignedBarrierOnly;
}
return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
- std::to_string(TotalBlocks) + " executed by initial thread only";
+ std::to_string(AlignedBlocks) + " of " +
+ std::to_string(TotalBlocks) +
+ " executed by initial thread / aligned";
}
/// See AbstractAttribute::trackStatistics().
@@ -2572,7 +2583,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
SmallPtrSet<CallBase *, 16> DeletedBarriers;
auto HandleAlignedBarrier = [&](CallBase *CB) {
- const ExecutionDomainTy &ED = CEDMap[CB];
+ const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];
if (!ED.IsReachedFromAlignedBarrierOnly ||
ED.EncounteredNonLocalSideEffect)
return;
@@ -2596,6 +2607,8 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
CallBase *LastCB = Worklist.pop_back_val();
if (!Visited.insert(LastCB))
continue;
+ if (LastCB->getFunction() != getAnchorScope())
+ continue;
if (!DeletedBarriers.count(LastCB)) {
A.deleteAfterManifest(*LastCB);
continue;
@@ -2603,7 +2616,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
// The final aligned barrier (LastCB) reaching the kernel end was
// removed already. This means we can go one step further and remove
// the barriers encountered last before (LastCB).
- const ExecutionDomainTy &LastED = CEDMap[LastCB];
+ const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
Worklist.append(LastED.AlignedBarriers.begin(),
LastED.AlignedBarriers.end());
}
@@ -2619,14 +2632,17 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
for (auto *CB : AlignedBarriers)
HandleAlignedBarrier(CB);
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
// Handle the "kernel end barrier" for kernels too.
- if (OMPInfoCache.Kernels.count(getAnchorScope()))
+ if (omp::isKernel(*getAnchorScope()))
HandleAlignedBarrier(nullptr);
return Changed;
}
+ bool isNoOpFence(const FenceInst &FI) const override {
+ return getState().isValidState() && !NonNoOpFences.count(&FI);
+ }
+
/// Merge barrier and assumption information from \p PredED into the successor
/// \p ED.
void
@@ -2636,12 +2652,12 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
/// Merge all information from \p PredED into the successor \p ED. If
/// \p InitialEdgeOnly is set, only the initial edge will enter the block
/// represented by \p ED from this predecessor.
- void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
+ bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
bool InitialEdgeOnly = false);
/// Accumulate information for the entry block in \p EntryBBED.
- void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED);
+ bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);
/// See AbstractAttribute::updateImpl.
ChangeStatus updateImpl(Attributor &A) override;
@@ -2651,14 +2667,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
if (!isValidState())
return false;
+ assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
}
bool isExecutedInAlignedRegion(Attributor &A,
const Instruction &I) const override {
- if (!isValidState() || isa<CallBase>(I))
+ assert(I.getFunction() == getAnchorScope() &&
+ "Instruction is out of scope!");
+ if (!isValidState())
return false;
+ bool ForwardIsOk = true;
const Instruction *CurI;
// Check forward until a call or the block end is reached.
@@ -2667,15 +2687,18 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
auto *CB = dyn_cast<CallBase>(CurI);
if (!CB)
continue;
- const auto &It = CEDMap.find(CB);
+ if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
+ return true;
+ const auto &It = CEDMap.find({CB, PRE});
if (It == CEDMap.end())
continue;
- if (!It->getSecond().IsReachedFromAlignedBarrierOnly)
- return false;
+ if (!It->getSecond().IsReachingAlignedBarrierOnly)
+ ForwardIsOk = false;
+ break;
} while ((CurI = CurI->getNextNonDebugInstruction()));
- if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly)
- return false;
+ if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
+ ForwardIsOk = false;
// Check backward until a call or the block beginning is reached.
CurI = &I;
@@ -2683,33 +2706,30 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
auto *CB = dyn_cast<CallBase>(CurI);
if (!CB)
continue;
- const auto &It = CEDMap.find(CB);
+ if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
+ return true;
+ const auto &It = CEDMap.find({CB, POST});
if (It == CEDMap.end())
continue;
- if (!AA::isNoSyncInst(A, *CB, *this)) {
- if (It->getSecond().IsReachedFromAlignedBarrierOnly)
- break;
- return false;
- }
-
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- return false;
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
- if (!EDAA.getState().isValidState())
- return false;
- if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly)
- return false;
- break;
+ if (It->getSecond().IsReachedFromAlignedBarrierOnly)
+ break;
+ return false;
} while ((CurI = CurI->getPrevNonDebugInstruction()));
- if (!CurI &&
- !llvm::all_of(
- predecessors(I.getParent()), [&](const BasicBlock *PredBB) {
- return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
- })) {
+ // Delayed decision on the forward pass to allow aligned barrier detection
+ // in the backwards traversal.
+ if (!ForwardIsOk)
return false;
+
+ if (!CurI) {
+ const BasicBlock *BB = I.getParent();
+ if (BB == &BB->getParent()->getEntryBlock())
+ return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;
+ if (!llvm::all_of(predecessors(BB), [&](const BasicBlock *PredBB) {
+ return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
+ })) {
+ return false;
+ }
}
// On neither traversal did we find anything but aligned barriers.
@@ -2721,15 +2741,16 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
"No request should be made against an invalid state!");
return BEDMap.lookup(&BB);
}
- ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override {
+ std::pair<ExecutionDomainTy, ExecutionDomainTy>
+ getExecutionDomain(const CallBase &CB) const override {
assert(isValidState() &&
"No request should be made against an invalid state!");
- return CEDMap.lookup(&CB);
+ return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
}
ExecutionDomainTy getFunctionExecutionDomain() const override {
assert(isValidState() &&
"No request should be made against an invalid state!");
- return BEDMap.lookup(nullptr);
+ return InterProceduralED;
}
///}
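
Call-site execution domains are now tracked in both directions, the state just before the call (PRE) and just after it (POST), with one DenseMap keyed by a PointerIntPair. A self-contained sketch of that keying, with DomainInfo standing in for the real ExecutionDomainTy:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/PointerIntPair.h"
    #include "llvm/IR/InstrTypes.h"
    #include <utility>

    namespace {
    struct DomainInfo { bool IsReachedFromAlignedBarrierOnly = true; };
    enum Direction { PRE = 0, POST = 1 };
    using CallEDMap =
        llvm::DenseMap<llvm::PointerIntPair<const llvm::CallBase *, 1, Direction>,
                       DomainInfo>;

    // Mirrors getExecutionDomain(const CallBase &): one lookup per direction.
    std::pair<DomainInfo, DomainInfo> lookupCall(const CallEDMap &CEDMap,
                                                 const llvm::CallBase &CB) {
      return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
    }
    } // namespace
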
@@ -2778,12 +2799,28 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
return false;
};
+ /// Mapping containing information about the function for other AAs.
+ ExecutionDomainTy InterProceduralED;
+
+ enum Direction { PRE = 0, POST = 1 };
/// Mapping containing information per block.
DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
- DenseMap<const CallBase *, ExecutionDomainTy> CEDMap;
+ DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
+ CEDMap;
SmallSetVector<CallBase *, 16> AlignedBarriers;
ReversePostOrderTraversal<Function *> *RPOT = nullptr;
+
+ /// Set \p R to \p V and report true if that changed \p R.
+ static bool setAndRecord(bool &R, bool V) {
+ bool Eq = (R == V);
+ R = V;
+ return !Eq;
+ }
+
+ /// Collection of fences known to be non-no-op. All fences not in this set
+ /// can be assumed to be no-ops.
+ SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
};
void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
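
setAndRecord is the change-tracking primitive the merge routines below rely on: a flag update reports whether the stored value actually flipped, so the fixpoint iteration only returns CHANGED for real movement. A standalone restatement of the idiom (mergeReachedFlag is illustrative only):

    // The real setAndRecord is a private static member of
    // AAExecutionDomainFunction; the logic is the same.
    static bool setAndRecord(bool &R, bool V) {
      bool Eq = (R == V);
      R = V;
      return !Eq;
    }

    // Typical use when merging a predecessor's flag into the current state.
    static bool mergeReachedFlag(bool &Dst, bool Pred) {
      return setAndRecord(Dst, Dst && Pred);
    }
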
@@ -2795,62 +2832,82 @@ void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
ED.addAlignedBarrier(A, *AB);
}
-void AAExecutionDomainFunction::mergeInPredecessor(
+bool AAExecutionDomainFunction::mergeInPredecessor(
Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
bool InitialEdgeOnly) {
- ED.IsExecutedByInitialThreadOnly =
- InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
- ED.IsExecutedByInitialThreadOnly);
-
- ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly &&
- PredED.IsReachedFromAlignedBarrierOnly;
- ED.EncounteredNonLocalSideEffect =
- ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect;
+
+ bool Changed = false;
+ Changed |=
+ setAndRecord(ED.IsExecutedByInitialThreadOnly,
+ InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
+ ED.IsExecutedByInitialThreadOnly));
+
+ Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
+ ED.IsReachedFromAlignedBarrierOnly &&
+ PredED.IsReachedFromAlignedBarrierOnly);
+ Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
+ ED.EncounteredNonLocalSideEffect |
+ PredED.EncounteredNonLocalSideEffect);
+ // Do not track assumptions and barriers as part of Changed.
if (ED.IsReachedFromAlignedBarrierOnly)
mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
else
ED.clearAssumeInstAndAlignedBarriers();
+ return Changed;
}
-void AAExecutionDomainFunction::handleEntryBB(Attributor &A,
+bool AAExecutionDomainFunction::handleCallees(Attributor &A,
ExecutionDomainTy &EntryBBED) {
- SmallVector<ExecutionDomainTy> PredExecDomains;
+ SmallVector<std::pair<ExecutionDomainTy, ExecutionDomainTy>, 4> CallSiteEDs;
auto PredForCallSite = [&](AbstractCallSite ACS) {
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+ const auto *EDAA = A.getAAFor<AAExecutionDomain>(
*this, IRPosition::function(*ACS.getInstruction()->getFunction()),
DepClassTy::OPTIONAL);
- if (!EDAA.getState().isValidState())
+ if (!EDAA || !EDAA->getState().isValidState())
return false;
- PredExecDomains.emplace_back(
- EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
+ CallSiteEDs.emplace_back(
+ EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
return true;
};
+ ExecutionDomainTy ExitED;
bool AllCallSitesKnown;
if (A.checkForAllCallSites(PredForCallSite, *this,
/* RequiresAllCallSites */ true,
AllCallSitesKnown)) {
- for (const auto &PredED : PredExecDomains)
- mergeInPredecessor(A, EntryBBED, PredED);
+ for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
+ mergeInPredecessor(A, EntryBBED, CSInED);
+ ExitED.IsReachingAlignedBarrierOnly &=
+ CSOutED.IsReachingAlignedBarrierOnly;
+ }
} else {
// We could not find all predecessors, so this is either a kernel or a
// function with external linkage (or with some other weird uses).
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- if (OMPInfoCache.Kernels.count(getAnchorScope())) {
+ if (omp::isKernel(*getAnchorScope())) {
EntryBBED.IsExecutedByInitialThreadOnly = false;
EntryBBED.IsReachedFromAlignedBarrierOnly = true;
EntryBBED.EncounteredNonLocalSideEffect = false;
+ ExitED.IsReachingAlignedBarrierOnly = true;
} else {
EntryBBED.IsExecutedByInitialThreadOnly = false;
EntryBBED.IsReachedFromAlignedBarrierOnly = false;
EntryBBED.EncounteredNonLocalSideEffect = true;
+ ExitED.IsReachingAlignedBarrierOnly = false;
}
}
+ bool Changed = false;
auto &FnED = BEDMap[nullptr];
- FnED.IsReachingAlignedBarrierOnly &=
- EntryBBED.IsReachedFromAlignedBarrierOnly;
+ Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
+ FnED.IsReachedFromAlignedBarrierOnly &
+ EntryBBED.IsReachedFromAlignedBarrierOnly);
+ Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
+ FnED.IsReachingAlignedBarrierOnly &
+ ExitED.IsReachingAlignedBarrierOnly);
+ Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
+ EntryBBED.IsExecutedByInitialThreadOnly);
+ return Changed;
}
ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
@@ -2860,36 +2917,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// Helper to deal with an aligned barrier encountered during the forward
// traversal. \p CB is the aligned barrier, \p ED is the execution domain when
// it was encountered.
- auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) {
- if (CB)
- Changed |= AlignedBarriers.insert(CB);
+ auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
+ Changed |= AlignedBarriers.insert(&CB);
// First, update the barrier ED kept in the separate CEDMap.
- auto &CallED = CEDMap[CB];
- mergeInPredecessor(A, CallED, ED);
+ auto &CallInED = CEDMap[{&CB, PRE}];
+ Changed |= mergeInPredecessor(A, CallInED, ED);
+ CallInED.IsReachingAlignedBarrierOnly = true;
// Next adjust the ED we use for the traversal.
ED.EncounteredNonLocalSideEffect = false;
ED.IsReachedFromAlignedBarrierOnly = true;
// Aligned barrier collection has to come last.
ED.clearAssumeInstAndAlignedBarriers();
- if (CB)
- ED.addAlignedBarrier(A, *CB);
+ ED.addAlignedBarrier(A, CB);
+ auto &CallOutED = CEDMap[{&CB, POST}];
+ Changed |= mergeInPredecessor(A, CallOutED, ED);
};
- auto &LivenessAA =
+ auto *LivenessAA =
A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);
- // Set \p R to \V and report true if that changed \p R.
- auto SetAndRecord = [&](bool &R, bool V) {
- bool Eq = (R == V);
- R = V;
- return !Eq;
- };
-
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
-
Function *F = getAnchorScope();
BasicBlock &EntryBB = F->getEntryBlock();
- bool IsKernel = OMPInfoCache.Kernels.count(F);
+ bool IsKernel = omp::isKernel(*F);
SmallVector<Instruction *> SyncInstWorklist;
for (auto &RIt : *RPOT) {
@@ -2899,18 +2948,19 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// TODO: We use local reasoning since we don't have a divergence analysis
// running as well. We could basically allow uniform branches here.
bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
+ bool IsExplicitlyAligned = IsEntryBB && IsKernel;
ExecutionDomainTy ED;
// Propagate "incoming edges" into information about this block.
if (IsEntryBB) {
- handleEntryBB(A, ED);
+ Changed |= handleCallees(A, ED);
} else {
// For live non-entry blocks we only propagate
// information via live edges.
- if (LivenessAA.isAssumedDead(&BB))
+ if (LivenessAA && LivenessAA->isAssumedDead(&BB))
continue;
for (auto *PredBB : predecessors(&BB)) {
- if (LivenessAA.isEdgeDead(PredBB, &BB))
+ if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
continue;
bool InitialEdgeOnly = isInitialThreadOnlyEdge(
A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
@@ -2922,7 +2972,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// information to calls.
for (Instruction &I : BB) {
bool UsedAssumedInformation;
- if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation,
+ if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,
/* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,
/* CheckForDeadStore */ true))
continue;
@@ -2939,6 +2989,33 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
continue;
}
+ if (auto *FI = dyn_cast<FenceInst>(&I)) {
+ if (!ED.EncounteredNonLocalSideEffect) {
+ // An aligned fence without non-local side-effects is a no-op.
+ if (ED.IsReachedFromAlignedBarrierOnly)
+ continue;
+ // A non-aligned fence without non-local side-effects is a no-op
+ // if the ordering only publishes non-local side-effects (or less).
+ switch (FI->getOrdering()) {
+ case AtomicOrdering::NotAtomic:
+ continue;
+ case AtomicOrdering::Unordered:
+ continue;
+ case AtomicOrdering::Monotonic:
+ continue;
+ case AtomicOrdering::Acquire:
+ break;
+ case AtomicOrdering::Release:
+ continue;
+ case AtomicOrdering::AcquireRelease:
+ break;
+ case AtomicOrdering::SequentiallyConsistent:
+ break;
+ };
+ }
+ NonNoOpFences.insert(FI);
+ }
+
auto *CB = dyn_cast<CallBase>(&I);
bool IsNoSync = AA::isNoSyncInst(A, I, *this);
bool IsAlignedBarrier =
@@ -2946,14 +3023,16 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);
AlignedBarrierLastInBlock &= IsNoSync;
+ IsExplicitlyAligned &= IsNoSync;
// Next we check for calls. Aligned barriers are handled
// explicitly, everything else is kept for the backward traversal and will
// also affect our state.
if (CB) {
if (IsAlignedBarrier) {
- HandleAlignedBarrier(CB, ED);
+ HandleAlignedBarrier(*CB, ED);
AlignedBarrierLastInBlock = true;
+ IsExplicitlyAligned = true;
continue;
}
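
The fence handling added above feeds the new isNoOpFence query: with no pending non-local side-effect, a fence already covered by an aligned barrier is skipped outright, and otherwise only orderings with an acquire component keep it out of the no-op set. A standalone restatement of that ordering filter (the helper name is not from the patch):

    #include "llvm/Support/AtomicOrdering.h"
    #include "llvm/Support/ErrorHandling.h"

    // True if a fence with this ordering must be kept even though the thread
    // has produced no non-local side-effects yet, i.e. it may need to order
    // incoming effects rather than publish outgoing ones.
    static bool keepsFenceWithoutLocalEffects(llvm::AtomicOrdering O) {
      using llvm::AtomicOrdering;
      switch (O) {
      case AtomicOrdering::NotAtomic:
      case AtomicOrdering::Unordered:
      case AtomicOrdering::Monotonic:
      case AtomicOrdering::Release:
        return false; // nothing of ours to publish -> treated as a no-op
      case AtomicOrdering::Acquire:
      case AtomicOrdering::AcquireRelease:
      case AtomicOrdering::SequentiallyConsistent:
        return true;
      }
      llvm_unreachable("Unknown atomic ordering");
    }
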
@@ -2971,20 +3050,20 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// Record how we entered the call, then accumulate the effect of the
// call in ED for potential use by the callee.
- auto &CallED = CEDMap[CB];
- mergeInPredecessor(A, CallED, ED);
+ auto &CallInED = CEDMap[{CB, PRE}];
+ Changed |= mergeInPredecessor(A, CallInED, ED);
// If we have a sync-definition we can check if it starts/ends in an
// aligned barrier. If we are unsure we assume any sync breaks
// alignment.
Function *Callee = CB->getCalledFunction();
if (!IsNoSync && Callee && !Callee->isDeclaration()) {
- const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+ const auto *EDAA = A.getAAFor<AAExecutionDomain>(
*this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
- if (EDAA.getState().isValidState()) {
- const auto &CalleeED = EDAA.getFunctionExecutionDomain();
+ if (EDAA && EDAA->getState().isValidState()) {
+ const auto &CalleeED = EDAA->getFunctionExecutionDomain();
ED.IsReachedFromAlignedBarrierOnly =
- CalleeED.IsReachedFromAlignedBarrierOnly;
+ CalleeED.IsReachedFromAlignedBarrierOnly;
AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
ED.EncounteredNonLocalSideEffect |=
@@ -2992,19 +3071,27 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
else
ED.EncounteredNonLocalSideEffect =
CalleeED.EncounteredNonLocalSideEffect;
- if (!CalleeED.IsReachingAlignedBarrierOnly)
+ if (!CalleeED.IsReachingAlignedBarrierOnly) {
+ Changed |=
+ setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
SyncInstWorklist.push_back(&I);
+ }
if (CalleeED.IsReachedFromAlignedBarrierOnly)
mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
+ auto &CallOutED = CEDMap[{CB, POST}];
+ Changed |= mergeInPredecessor(A, CallOutED, ED);
continue;
}
}
- ED.IsReachedFromAlignedBarrierOnly =
- IsNoSync && ED.IsReachedFromAlignedBarrierOnly;
+ if (!IsNoSync) {
+ ED.IsReachedFromAlignedBarrierOnly = false;
+ Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
+ SyncInstWorklist.push_back(&I);
+ }
AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory();
- if (!IsNoSync)
- SyncInstWorklist.push_back(&I);
+ auto &CallOutED = CEDMap[{CB, POST}];
+ Changed |= mergeInPredecessor(A, CallOutED, ED);
}
if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
@@ -3013,7 +3100,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// If we have a callee we try to use fine-grained information to
// determine local side-effects.
if (CB) {
- const auto &MemAA = A.getAAFor<AAMemoryLocation>(
+ const auto *MemAA = A.getAAFor<AAMemoryLocation>(
*this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL);
auto AccessPred = [&](const Instruction *I, const Value *Ptr,
@@ -3021,13 +3108,14 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
AAMemoryLocation::MemoryLocationsKind) {
return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I);
};
- if (MemAA.getState().isValidState() &&
- MemAA.checkForAllAccessesToMemoryKind(
+ if (MemAA && MemAA->getState().isValidState() &&
+ MemAA->checkForAllAccessesToMemoryKind(
AccessPred, AAMemoryLocation::ALL_LOCATIONS))
continue;
}
- if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I))
+ auto &InfoCache = A.getInfoCache();
+ if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))
continue;
if (auto *LI = dyn_cast<LoadInst>(&I))
@@ -3039,18 +3127,28 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
ED.EncounteredNonLocalSideEffect = true;
}
+ bool IsEndAndNotReachingAlignedBarriersOnly = false;
if (!isa<UnreachableInst>(BB.getTerminator()) &&
!BB.getTerminator()->getNumSuccessors()) {
- auto &FnED = BEDMap[nullptr];
- mergeInPredecessor(A, FnED, ED);
+ Changed |= mergeInPredecessor(A, InterProceduralED, ED);
- if (IsKernel)
- HandleAlignedBarrier(nullptr, ED);
+ auto &FnED = BEDMap[nullptr];
+ if (IsKernel && !IsExplicitlyAligned)
+ FnED.IsReachingAlignedBarrierOnly = false;
+ Changed |= mergeInPredecessor(A, FnED, ED);
+
+ if (!FnED.IsReachingAlignedBarrierOnly) {
+ IsEndAndNotReachingAlignedBarriersOnly = true;
+ SyncInstWorklist.push_back(BB.getTerminator());
+ auto &BBED = BEDMap[&BB];
+ Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);
+ }
}
ExecutionDomainTy &StoredED = BEDMap[&BB];
- ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly;
+ ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
+ !IsEndAndNotReachingAlignedBarriersOnly;
// Check if we computed anything different as part of the forward
// traversal. We do not take assumptions and aligned barriers into account
@@ -3074,36 +3172,38 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
while (!SyncInstWorklist.empty()) {
Instruction *SyncInst = SyncInstWorklist.pop_back_val();
Instruction *CurInst = SyncInst;
- bool HitAlignedBarrier = false;
+ bool HitAlignedBarrierOrKnownEnd = false;
while ((CurInst = CurInst->getPrevNode())) {
auto *CB = dyn_cast<CallBase>(CurInst);
if (!CB)
continue;
- auto &CallED = CEDMap[CB];
- if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false))
- Changed = true;
- HitAlignedBarrier = AlignedBarriers.count(CB);
- if (HitAlignedBarrier)
+ auto &CallOutED = CEDMap[{CB, POST}];
+ Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);
+ auto &CallInED = CEDMap[{CB, PRE}];
+ HitAlignedBarrierOrKnownEnd =
+ AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
+ if (HitAlignedBarrierOrKnownEnd)
break;
+ Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
}
- if (HitAlignedBarrier)
+ if (HitAlignedBarrierOrKnownEnd)
continue;
BasicBlock *SyncBB = SyncInst->getParent();
for (auto *PredBB : predecessors(SyncBB)) {
- if (LivenessAA.isEdgeDead(PredBB, SyncBB))
+ if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
continue;
if (!Visited.insert(PredBB))
continue;
- SyncInstWorklist.push_back(PredBB->getTerminator());
auto &PredED = BEDMap[PredBB];
- if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false))
+ if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {
Changed = true;
+ SyncInstWorklist.push_back(PredBB->getTerminator());
+ }
}
if (SyncBB != &EntryBB)
continue;
- auto &FnED = BEDMap[nullptr];
- if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false))
- Changed = true;
+ Changed |=
+ setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);
}
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
@@ -3146,7 +3246,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
: AAHeapToShared(IRP, A) {}
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
" malloc calls eligible.";
}
@@ -3261,7 +3361,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
auto *SharedMem = new GlobalVariable(
*M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
- UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,
+ PoisonValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,
GlobalValue::NotThreadLocal,
static_cast<unsigned>(AddressSpace::Shared));
auto *NewBuffer =
@@ -3270,7 +3370,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
auto Remark = [&](OptimizationRemark OR) {
return OR << "Replaced globalized variable with "
<< ore::NV("SharedMemory", AllocSize->getZExtValue())
- << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
+ << (AllocSize->isOne() ? " byte " : " bytes ")
<< "of shared memory.";
};
A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
@@ -3278,7 +3378,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
MaybeAlign Alignment = CB->getRetAlign();
assert(Alignment &&
"HeapToShared on allocation without alignment attribute");
- SharedMem->setAlignment(MaybeAlign(Alignment));
+ SharedMem->setAlignment(*Alignment);
A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer);
A.deleteAfterManifest(*CB);
@@ -3315,9 +3415,9 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
MallocCalls.remove(CB);
continue;
}
- const auto &ED = A.getAAFor<AAExecutionDomain>(
+ const auto *ED = A.getAAFor<AAExecutionDomain>(
*this, IRPosition::function(*F), DepClassTy::REQUIRED);
- if (!ED.isExecutedByInitialThreadOnly(*CB))
+ if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
MallocCalls.remove(CB);
}
}
@@ -3346,7 +3446,7 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
void trackStatistics() const override {}
/// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
if (!isValidState())
return "<invalid>";
return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
@@ -3456,22 +3556,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> std::optional<Value *> {
- // IRP represents the "use generic state machine" argument of an
- // __kmpc_target_init call. We will answer this one with the internal
- // state. As long as we are not in an invalid state, we will create a
- // custom state machine so the value should be a `i1 false`. If we are
- // in an invalid state, we won't change the value that is in the IR.
- if (!ReachedKnownParallelRegions.isValidState())
- return nullptr;
- // If we have disabled state machine rewrites, don't make a custom one.
- if (DisableOpenMPOptStateMachineRewrite)
return nullptr;
- if (AA)
- A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
- UsedAssumedInformation = !isAtFixpoint();
- auto *FalseVal =
- ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
- return FalseVal;
};
Attributor::SimplifictionCallbackTy ModeSimplifyCB =
@@ -3622,10 +3707,11 @@ struct AAKernelInfoFunction : AAKernelInfo {
Function *Kernel = getAnchorScope();
Module &M = *Kernel->getParent();
Type *Int8Ty = Type::getInt8Ty(M.getContext());
- new GlobalVariable(M, Int8Ty, /* isConstant */ true,
- GlobalValue::WeakAnyLinkage,
- ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
- Kernel->getName() + "_nested_parallelism");
+ auto *GV = new GlobalVariable(
+ M, Int8Ty, /* isConstant */ true, GlobalValue::WeakAnyLinkage,
+ ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
+ Kernel->getName() + "_nested_parallelism");
+ GV->setVisibility(GlobalValue::HiddenVisibility);
// If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine.
@@ -3914,6 +4000,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ // We cannot change to SPMD mode if the runtime functions aren't available.
+ if (!OMPInfoCache.runtimeFnsAvailable(
+ {OMPRTL___kmpc_get_hardware_thread_id_in_block,
+ OMPRTL___kmpc_barrier_simple_spmd}))
+ return false;
+
if (!SPMDCompatibilityTracker.isAssumed()) {
for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
if (!NonCompatibleI)
@@ -3951,7 +4043,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
auto *CB = cast<CallBase>(Kernel->user_back());
Kernel = CB->getCaller();
}
- assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
+ assert(omp::isKernel(*Kernel) && "Expected kernel function!");
// Check if the kernel is already in SPMD mode, if so, return success.
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
@@ -4021,6 +4113,13 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!ReachedKnownParallelRegions.isValidState())
return ChangeStatus::UNCHANGED;
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ if (!OMPInfoCache.runtimeFnsAvailable(
+ {OMPRTL___kmpc_get_hardware_num_threads_in_block,
+ OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
+ OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
+ return ChangeStatus::UNCHANGED;
+
const int InitModeArgNo = 1;
const int InitUseStateMachineArgNo = 2;
@@ -4167,7 +4266,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);
Module &M = *Kernel->getParent();
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
FunctionCallee BlockHwSizeFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
@@ -4220,10 +4318,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (WorkFnAI->getType()->getPointerAddressSpace() !=
(unsigned int)AddressSpace::Generic) {
WorkFnAI = new AddrSpaceCastInst(
- WorkFnAI,
- PointerType::getWithSamePointeeType(
- cast<PointerType>(WorkFnAI->getType()),
- (unsigned int)AddressSpace::Generic),
+ WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),
WorkFnAI->getName() + ".generic", StateMachineBeginBB);
WorkFnAI->setDebugLoc(DLoc);
}
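
The cast above was updated for opaque pointers: the result type is built from just the context and the target address space, with no pointee type to re-derive. A minimal standalone sketch of that construction (the function name is an assumption; 0 mirrors AddressSpace::Generic in this file):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Instructions.h"

    // Cast a pointer value into the generic address space; with opaque
    // pointers only the address space matters for the destination type.
    static llvm::Value *castToGenericAS(llvm::Value *V,
                                        llvm::LLVMContext &Ctx,
                                        llvm::Instruction *InsertBefore) {
      const unsigned GenericAS = 0;
      return new llvm::AddrSpaceCastInst(
          V, llvm::PointerType::get(Ctx, GenericAS),
          V->getName() + ".generic", InsertBefore);
    }
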
@@ -4345,19 +4440,20 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!I.mayWriteToMemory())
return true;
if (auto *SI = dyn_cast<StoreInst>(&I)) {
- const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
+ const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
*this, IRPosition::value(*SI->getPointerOperand()),
DepClassTy::OPTIONAL);
- auto &HS = A.getAAFor<AAHeapToStack>(
+ auto *HS = A.getAAFor<AAHeapToStack>(
*this, IRPosition::function(*I.getFunction()),
DepClassTy::OPTIONAL);
- if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) {
+ if (UnderlyingObjsAA &&
+ UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {
if (AA::isAssumedThreadLocalObject(A, Obj, *this))
return true;
// Check for AAHeapToStack moved objects which must not be
// guarded.
auto *CB = dyn_cast<CallBase>(&Obj);
- return CB && HS.isAssumedHeapToStack(*CB);
+ return CB && HS && HS->isAssumedHeapToStack(*CB);
}))
return true;
}
@@ -4392,14 +4488,14 @@ struct AAKernelInfoFunction : AAKernelInfo {
// we cannot fix the internal spmd-zation state either.
int SPMD = 0, Generic = 0;
for (auto *Kernel : ReachingKernelEntries) {
- auto &CBAA = A.getAAFor<AAKernelInfo>(
+ auto *CBAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
- if (CBAA.SPMDCompatibilityTracker.isValidState() &&
- CBAA.SPMDCompatibilityTracker.isAssumed())
+ if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
+ CBAA->SPMDCompatibilityTracker.isAssumed())
++SPMD;
else
++Generic;
- if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
+ if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
UsedAssumedInformationFromReachingKernels = true;
}
if (SPMD != 0 && Generic != 0)
@@ -4413,14 +4509,16 @@ struct AAKernelInfoFunction : AAKernelInfo {
bool AllSPMDStatesWereFixed = true;
auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I);
- auto &CBAA = A.getAAFor<AAKernelInfo>(
+ auto *CBAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
- getState() ^= CBAA.getState();
- AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
+ if (!CBAA)
+ return false;
+ getState() ^= CBAA->getState();
+ AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
AllParallelRegionStatesWereFixed &=
- CBAA.ReachedKnownParallelRegions.isAtFixpoint();
+ CBAA->ReachedKnownParallelRegions.isAtFixpoint();
AllParallelRegionStatesWereFixed &=
- CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
+ CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
return true;
};
@@ -4460,10 +4558,10 @@ private:
assert(Caller && "Caller is nullptr");
- auto &CAA = A.getOrCreateAAFor<AAKernelInfo>(
+ auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(
IRPosition::function(*Caller), this, DepClassTy::REQUIRED);
- if (CAA.ReachingKernelEntries.isValidState()) {
- ReachingKernelEntries ^= CAA.ReachingKernelEntries;
+ if (CAA && CAA->ReachingKernelEntries.isValidState()) {
+ ReachingKernelEntries ^= CAA->ReachingKernelEntries;
return true;
}
@@ -4491,9 +4589,9 @@ private:
assert(Caller && "Caller is nullptr");
- auto &CAA =
+ auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
- if (CAA.ParallelLevels.isValidState()) {
+ if (CAA && CAA->ParallelLevels.isValidState()) {
// Any function that is called by `__kmpc_parallel_51` will not be
// folded as the parallel level in the function is updated. In order to
// get it right, all the analysis would depend on the implementation. That
@@ -4504,7 +4602,7 @@ private:
return true;
}
- ParallelLevels ^= CAA.ParallelLevels;
+ ParallelLevels ^= CAA->ParallelLevels;
return true;
}
@@ -4538,11 +4636,11 @@ struct AAKernelInfoCallSite : AAKernelInfo {
CallBase &CB = cast<CallBase>(getAssociatedValue());
Function *Callee = getAssociatedFunction();
- auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
+ auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>(
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
// Check for SPMD-mode assumptions.
- if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) {
+ if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
indicateOptimisticFixpoint();
}
@@ -4567,8 +4665,9 @@ struct AAKernelInfoCallSite : AAKernelInfo {
// Unknown callees might contain parallel regions, except if they have
// an appropriate assumption attached.
- if (!(AssumptionAA.hasAssumption("omp_no_openmp") ||
- AssumptionAA.hasAssumption("omp_no_parallelism")))
+ if (!AssumptionAA ||
+ !(AssumptionAA->hasAssumption("omp_no_openmp") ||
+ AssumptionAA->hasAssumption("omp_no_parallelism")))
ReachedUnknownParallelRegions.insert(&CB);
// If SPMDCompatibilityTracker is not fixed, we need to give up on the
@@ -4643,11 +4742,11 @@ struct AAKernelInfoCallSite : AAKernelInfo {
CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
ReachedKnownParallelRegions.insert(ParallelRegion);
/// Check nested parallelism
- auto &FnAA = A.getAAFor<AAKernelInfo>(
+ auto *FnAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
- NestedParallelism |= !FnAA.getState().isValidState() ||
- !FnAA.ReachedKnownParallelRegions.empty() ||
- !FnAA.ReachedUnknownParallelRegions.empty();
+ NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
+ !FnAA->ReachedKnownParallelRegions.empty() ||
+ !FnAA->ReachedUnknownParallelRegions.empty();
break;
}
// The condition above should usually get the parallel region function
@@ -4691,10 +4790,12 @@ struct AAKernelInfoCallSite : AAKernelInfo {
// If F is not a runtime function, propagate the AAKernelInfo of the callee.
if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
- if (getState() == FnAA.getState())
+ auto *FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
+ if (!FnAA)
+ return indicatePessimisticFixpoint();
+ if (getState() == FnAA->getState())
return ChangeStatus::UNCHANGED;
- getState() = FnAA.getState();
+ getState() = FnAA->getState();
return ChangeStatus::CHANGED;
}
@@ -4707,9 +4808,9 @@ struct AAKernelInfoCallSite : AAKernelInfo {
CallBase &CB = cast<CallBase>(getAssociatedValue());
- auto &HeapToStackAA = A.getAAFor<AAHeapToStack>(
+ auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
*this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
- auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>(
+ auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
*this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
RuntimeFunction RF = It->getSecond();
@@ -4718,13 +4819,15 @@ struct AAKernelInfoCallSite : AAKernelInfo {
// If neither HeapToStack nor HeapToShared assume the call is removed,
// assume SPMD incompatibility.
case OMPRTL___kmpc_alloc_shared:
- if (!HeapToStackAA.isAssumedHeapToStack(CB) &&
- !HeapToSharedAA.isAssumedHeapToShared(CB))
+ if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
+ (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
SPMDCompatibilityTracker.insert(&CB);
break;
case OMPRTL___kmpc_free_shared:
- if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) &&
- !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB))
+ if ((!HeapToStackAA ||
+ !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
+ (!HeapToSharedAA ||
+ !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
SPMDCompatibilityTracker.insert(&CB);
break;
default:
@@ -4770,7 +4873,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
: AAFoldRuntimeCall(IRP, A) {}
/// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
if (!isValidState())
return "<invalid>";
@@ -4883,28 +4986,29 @@ private:
unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+ auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
+ if (!CallerKernelInfoAA ||
+ !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
return indicatePessimisticFixpoint();
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
+ for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
+ auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
DepClassTy::REQUIRED);
- if (!AA.isValidState()) {
+ if (!AA || !AA->isValidState()) {
SimplifiedValue = nullptr;
return indicatePessimisticFixpoint();
}
- if (AA.SPMDCompatibilityTracker.isAssumed()) {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
+ if (AA->SPMDCompatibilityTracker.isAssumed()) {
+ if (AA->SPMDCompatibilityTracker.isAtFixpoint())
++KnownSPMDCount;
else
++AssumedSPMDCount;
} else {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
+ if (AA->SPMDCompatibilityTracker.isAtFixpoint())
++KnownNonSPMDCount;
else
++AssumedNonSPMDCount;
@@ -4943,16 +5047,17 @@ private:
ChangeStatus foldParallelLevel(Attributor &A) {
std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+ auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ParallelLevels.isValidState())
+ if (!CallerKernelInfoAA ||
+ !CallerKernelInfoAA->ParallelLevels.isValidState())
return indicatePessimisticFixpoint();
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
+ if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
return indicatePessimisticFixpoint();
- if (CallerKernelInfoAA.ReachingKernelEntries.empty()) {
+ if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
assert(!SimplifiedValue &&
"SimplifiedValue should keep none at this point");
return ChangeStatus::UNCHANGED;
@@ -4960,19 +5065,19 @@ private:
unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
+ for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
+ auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
DepClassTy::REQUIRED);
- if (!AA.SPMDCompatibilityTracker.isValidState())
+ if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
return indicatePessimisticFixpoint();
- if (AA.SPMDCompatibilityTracker.isAssumed()) {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
+ if (AA->SPMDCompatibilityTracker.isAssumed()) {
+ if (AA->SPMDCompatibilityTracker.isAtFixpoint())
++KnownSPMDCount;
else
++AssumedSPMDCount;
} else {
- if (AA.SPMDCompatibilityTracker.isAtFixpoint())
+ if (AA->SPMDCompatibilityTracker.isAtFixpoint())
++KnownNonSPMDCount;
else
++AssumedNonSPMDCount;
@@ -5005,14 +5110,15 @@ private:
int32_t CurrentAttrValue = -1;
std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
- auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+ auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
- if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
+ if (!CallerKernelInfoAA ||
+ !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
return indicatePessimisticFixpoint();
// Iterate over the kernels that reach this function
- for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+ for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
if (NextAttrVal == -1 ||
@@ -5135,6 +5241,8 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
if (!DisableOpenMPOptDeglobalization)
A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
+ if (F.hasFnAttribute(Attribute::Convergent))
+ A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F));
for (auto &I : instructions(F)) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
@@ -5147,6 +5255,10 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
continue;
}
+ if (auto *FI = dyn_cast<FenceInst>(&I)) {
+ A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI));
+ continue;
+ }
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
if (II->getIntrinsicID() == Intrinsic::assume) {
A.getOrCreateAAFor<AAPotentialValues>(
@@ -5304,6 +5416,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
});
};
+ bool Changed = false;
+
// Create internal copies of each function if this is a kernel Module. This
// allows interprocedural passes to see every call edge.
DenseMap<Function *, Function *> InternalizedMap;
@@ -5319,7 +5433,8 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
}
}
- Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
+ Changed |=
+ Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
}
// Look at every function in the Module unless it was internalized.
@@ -5332,7 +5447,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
}
if (SCC.empty())
- return PreservedAnalyses::all();
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
AnalysisGetter AG(FAM);
@@ -5343,7 +5458,9 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
BumpPtrAllocator Allocator;
CallGraphUpdater CGUpdater;
- OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels);
+ bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
+ OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, PostLink);
unsigned MaxFixpointIterations =
(isOpenMPDevice(M)) ? SetFixpointIterations : 32;
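
Both pass entry points now derive the post-link flag from the LTO phase they run in and hand it to the info cache. A compact restatement of that mapping (the helper is illustrative; the two phases treated as post-link are taken verbatim from the patch):

    #include "llvm/Pass.h"

    // Same decision as in OpenMPOptPass::run and OpenMPOptCGSCCPass::run.
    static bool isOpenMPPostLink(llvm::ThinOrFullLTOPhase Phase) {
      return Phase == llvm::ThinOrFullLTOPhase::FullLTOPostLink ||
             Phase == llvm::ThinOrFullLTOPhase::ThinLTOPreLink;
    }
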
@@ -5356,11 +5473,14 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
AC.OREGetter = OREGetter;
AC.PassName = DEBUG_TYPE;
AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
+ AC.IPOAmendableCB = [](const Function &F) {
+ return F.hasFnAttribute("kernel");
+ };
Attributor A(Functions, InfoCache, AC);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Changed = OMPOpt.run(true);
+ Changed |= OMPOpt.run(true);
// Optionally inline device functions for potentially better performance.
if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
@@ -5417,9 +5537,11 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
CallGraphUpdater CGUpdater;
CGUpdater.initialize(CG, C, AM, UR);
+ bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
SetVector<Function *> Functions(SCC.begin(), SCC.end());
OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ &Functions, Kernels);
+ /*CGSCC*/ &Functions, PostLink);
unsigned MaxFixpointIterations =
(isOpenMPDevice(M)) ? SetFixpointIterations : 32;
@@ -5447,6 +5569,8 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
return PreservedAnalyses::all();
}
+bool llvm::omp::isKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); }
+
KernelSet llvm::omp::getDeviceKernels(Module &M) {
// TODO: Create a more cross-platform way of determining device kernels.
NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
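
With the KernelSet plumbing removed, kernel-ness is queried directly from the function's "kernel" attribute via omp::isKernel. A small usage sketch (the counting loop is illustrative, not from the patch):

    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO/OpenMPOpt.h" // declares omp::isKernel

    // Count device kernels by attribute instead of consulting a KernelSet.
    static unsigned countKernels(llvm::Module &M) {
      unsigned NumKernels = 0;
      for (llvm::Function &F : M)
        if (llvm::omp::isKernel(F))
          ++NumKernels;
      return NumKernels;
    }
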
@@ -5467,6 +5591,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) {
if (!KernelFn)
continue;
+ assert(isKernel(*KernelFn) && "Inconsistent kernel function annotation");
++NumOpenMPTargetRegionKernels;
Kernels.insert(KernelFn);