path: root/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
author     Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:04 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-02-11 12:38:11 +0000
commit     e3b557809604d036af6e00c60f012c2025b59a5e (patch)
tree       8a11ba2269a3b669601e2fd41145b174008f4da8 /llvm/lib/Transforms/IPO/OpenMPOpt.cpp
parent     08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff)
Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1621
1 file changed, 943 insertions(+), 678 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index ef2384faa273..bee154dab10f 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
@@ -32,6 +33,7 @@
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/GlobalValue.h"
@@ -45,12 +47,13 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include <algorithm>
+#include <optional>
+#include <string>
using namespace llvm;
using namespace omp;
@@ -71,6 +74,8 @@ static cl::opt<bool>
cl::desc("Disable function internalization."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",
+ cl::init(false), cl::Hidden);
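
    [Editorial note] Usage sketch, not part of the patch: the new flag is a
    hidden cl::opt, so it can be exercised through opt. This assumes the
    module pass is registered as `openmp-opt`, as in LLVM's PassRegistry:

        opt -passes=openmp-opt -openmp-deduce-icv-values -S in.ll -o out.ll
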
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
@@ -182,13 +187,13 @@ struct AAICVTracker;
/// Attributor runs.
struct OMPInformationCache : public InformationCache {
OMPInformationCache(Module &M, AnalysisGetter &AG,
- BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
+ BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
KernelSet &Kernels)
- : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
+ : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
Kernels(Kernels) {
OMPBuilder.initialize();
- initializeRuntimeFunctions();
+ initializeRuntimeFunctions(M);
initializeInternalControlVars();
}
@@ -412,7 +417,7 @@ struct OMPInformationCache : public InformationCache {
// TODO: We directly convert uses into proper calls and unknown uses.
for (Use &U : RFI.Declaration->uses()) {
if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
- if (ModuleSlice.count(UserI->getFunction())) {
+ if (ModuleSlice.empty() || ModuleSlice.count(UserI->getFunction())) {
RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
++NumUses;
}
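
    [Editorial note] With the CGSCC set now passed as a pointer (see the
    constructor change above), an empty ModuleSlice means "no restriction",
    i.e. the whole module. A hedged helper expressing that convention,
    illustrative only and not in the patch:

        // Sketch: empty slice => module pass => every function is in scope.
        bool isInModuleSlice(Function *F) const {
          return ModuleSlice.empty() || ModuleSlice.count(F);
        }
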
@@ -445,8 +450,7 @@ struct OMPInformationCache : public InformationCache {
/// Helper to initialize all runtime function information for those defined
/// in OpenMPKinds.def.
- void initializeRuntimeFunctions() {
- Module &M = *((*ModuleSlice.begin())->getParent());
+ void initializeRuntimeFunctions(Module &M) {
// Helper macros for handling __VA_ARGS__ in OMP_RTL
#define OMP_TYPE(VarName, ...) \
@@ -499,6 +503,18 @@ struct OMPInformationCache : public InformationCache {
}
#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ // Remove the `noinline` attribute from `__kmpc`, `ompx::` and `omp_`
+ // functions, except if `optnone` is present.
+ if (isOpenMPDevice(M)) {
+ for (Function &F : M) {
+ for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
+ if (F.hasFnAttribute(Attribute::NoInline) &&
+ F.getName().startswith(Prefix) &&
+ !F.hasFnAttribute(Attribute::OptimizeNone))
+ F.removeFnAttr(Attribute::NoInline);
+ }
+ }
+
// TODO: We should attach the attributes defined in OMPKinds.def.
}
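
    [Editorial note] Illustration of the attribute policy above (the third
    function is hypothetical, for exposition only):

        // __kmpc_parallel_51()   noinline          -> noinline stripped
        // omp_get_thread_num()   noinline optnone  -> untouched (optnone present)
        // user_helper()          noinline          -> untouched (no matching prefix)
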
@@ -587,6 +603,9 @@ struct KernelInfoState : AbstractState {
/// caller is __kmpc_parallel_51.
BooleanStateWithSetVector<uint8_t> ParallelLevels;
+ /// Flag that indicates if the kernel has nested parallelism.
+ bool NestedParallelism = false;
+
/// Abstract State interface
///{
@@ -605,6 +624,7 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicatePessimisticFixpoint(...)
ChangeStatus indicatePessimisticFixpoint() override {
IsAtFixpoint = true;
+ ParallelLevels.indicatePessimisticFixpoint();
ReachingKernelEntries.indicatePessimisticFixpoint();
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
ReachedKnownParallelRegions.indicatePessimisticFixpoint();
@@ -615,6 +635,7 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicateOptimisticFixpoint(...)
ChangeStatus indicateOptimisticFixpoint() override {
IsAtFixpoint = true;
+ ParallelLevels.indicateOptimisticFixpoint();
ReachingKernelEntries.indicateOptimisticFixpoint();
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
ReachedKnownParallelRegions.indicateOptimisticFixpoint();
@@ -635,6 +656,8 @@ struct KernelInfoState : AbstractState {
return false;
if (ReachingKernelEntries != RHS.ReachingKernelEntries)
return false;
+ if (ParallelLevels != RHS.ParallelLevels)
+ return false;
return true;
}
@@ -672,6 +695,7 @@ struct KernelInfoState : AbstractState {
SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
+ NestedParallelism |= KIS.NestedParallelism;
return *this;
}
@@ -806,8 +830,6 @@ struct OpenMPOpt {
if (remarksEnabled())
analysisGlobalization();
-
- Changed |= eliminateBarriers();
} else {
if (PrintICVValues)
printICVs();
@@ -830,8 +852,6 @@ struct OpenMPOpt {
Changed = true;
}
}
-
- Changed |= eliminateBarriers();
}
return Changed;
@@ -843,7 +863,7 @@ struct OpenMPOpt {
InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
ICV_proc_bind};
- for (Function *F : OMPInfoCache.ModuleSlice) {
+ for (Function *F : SCC) {
for (auto ICV : ICVs) {
auto ICVInfo = OMPInfoCache.ICVs[ICV];
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
@@ -1397,212 +1417,6 @@ private:
return Changed;
}
- /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
- /// TODO: Make this an AA and expand it to work across blocks and functions.
- bool eliminateBarriers() {
- bool Changed = false;
-
- if (DisableOpenMPOptBarrierElimination)
- return /*Changed=*/false;
-
- if (OMPInfoCache.Kernels.empty())
- return /*Changed=*/false;
-
- enum ImplicitBarrierType { IBT_ENTRY, IBT_EXIT };
-
- class BarrierInfo {
- Instruction *I;
- enum ImplicitBarrierType Type;
-
- public:
- BarrierInfo(enum ImplicitBarrierType Type) : I(nullptr), Type(Type) {}
- BarrierInfo(Instruction &I) : I(&I) {}
-
- bool isImplicit() { return !I; }
-
- bool isImplicitEntry() { return isImplicit() && Type == IBT_ENTRY; }
-
- bool isImplicitExit() { return isImplicit() && Type == IBT_EXIT; }
-
- Instruction *getInstruction() { return I; }
- };
-
- for (Function *Kernel : OMPInfoCache.Kernels) {
- for (BasicBlock &BB : *Kernel) {
- SmallVector<BarrierInfo, 8> BarriersInBlock;
- SmallPtrSet<Instruction *, 8> BarriersToBeDeleted;
-
- // Add the kernel entry implicit barrier.
- if (&Kernel->getEntryBlock() == &BB)
- BarriersInBlock.push_back(IBT_ENTRY);
-
- // Find implicit and explicit aligned barriers in the same basic block.
- for (Instruction &I : BB) {
- if (isa<ReturnInst>(I)) {
- // Add the implicit barrier when exiting the kernel.
- BarriersInBlock.push_back(IBT_EXIT);
- continue;
- }
- CallBase *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
-
- auto IsAlignBarrierCB = [&](CallBase &CB) {
- switch (CB.getIntrinsicID()) {
- case Intrinsic::nvvm_barrier0:
- case Intrinsic::nvvm_barrier0_and:
- case Intrinsic::nvvm_barrier0_or:
- case Intrinsic::nvvm_barrier0_popc:
- return true;
- default:
- break;
- }
- return hasAssumption(CB,
- KnownAssumptionString("ompx_aligned_barrier"));
- };
-
- if (IsAlignBarrierCB(*CB)) {
- // Add an explicit aligned barrier.
- BarriersInBlock.push_back(I);
- }
- }
-
- if (BarriersInBlock.size() <= 1)
- continue;
-
- // A barrier in a barrier pair is removeable if all instructions
- // between the barriers in the pair are side-effect free modulo the
- // barrier operation.
- auto IsBarrierRemoveable = [&Kernel](BarrierInfo *StartBI,
- BarrierInfo *EndBI) {
- assert(
- !StartBI->isImplicitExit() &&
- "Expected start barrier to be other than a kernel exit barrier");
- assert(
- !EndBI->isImplicitEntry() &&
- "Expected end barrier to be other than a kernel entry barrier");
- // If StartBI's instruction is null then this is the implicit
- // kernel entry barrier, so iterate from the first instruction in the
- // entry block.
- Instruction *I = (StartBI->isImplicitEntry())
- ? &Kernel->getEntryBlock().front()
- : StartBI->getInstruction()->getNextNode();
- assert(I && "Expected non-null start instruction");
- Instruction *E = (EndBI->isImplicitExit())
- ? I->getParent()->getTerminator()
- : EndBI->getInstruction();
- assert(E && "Expected non-null end instruction");
-
- for (; I != E; I = I->getNextNode()) {
- if (!I->mayHaveSideEffects() && !I->mayReadFromMemory())
- continue;
-
- auto IsPotentiallyAffectedByBarrier =
- [](Optional<MemoryLocation> Loc) {
- const Value *Obj = (Loc && Loc->Ptr)
- ? getUnderlyingObject(Loc->Ptr)
- : nullptr;
- if (!Obj) {
- LLVM_DEBUG(
- dbgs()
- << "Access to unknown location requires barriers\n");
- return true;
- }
- if (isa<UndefValue>(Obj))
- return false;
- if (isa<AllocaInst>(Obj))
- return false;
- if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
- if (GV->isConstant())
- return false;
- if (GV->isThreadLocal())
- return false;
- if (GV->getAddressSpace() == (int)AddressSpace::Local)
- return false;
- if (GV->getAddressSpace() == (int)AddressSpace::Constant)
- return false;
- }
- LLVM_DEBUG(dbgs() << "Access to '" << *Obj
- << "' requires barriers\n");
- return true;
- };
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
- Optional<MemoryLocation> Loc = MemoryLocation::getForDest(MI);
- if (IsPotentiallyAffectedByBarrier(Loc))
- return false;
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
- Optional<MemoryLocation> Loc =
- MemoryLocation::getForSource(MTI);
- if (IsPotentiallyAffectedByBarrier(Loc))
- return false;
- }
- continue;
- }
-
- if (auto *LI = dyn_cast<LoadInst>(I))
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- continue;
-
- Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
- if (IsPotentiallyAffectedByBarrier(Loc))
- return false;
- }
-
- return true;
- };
-
- // Iterate barrier pairs and remove an explicit barrier if analysis
- // deems it removeable.
- for (auto *It = BarriersInBlock.begin(),
- *End = BarriersInBlock.end() - 1;
- It != End; ++It) {
-
- BarrierInfo *StartBI = It;
- BarrierInfo *EndBI = (It + 1);
-
- // Cannot remove when both are implicit barriers, continue.
- if (StartBI->isImplicit() && EndBI->isImplicit())
- continue;
-
- if (!IsBarrierRemoveable(StartBI, EndBI))
- continue;
-
- assert(!(StartBI->isImplicit() && EndBI->isImplicit()) &&
- "Expected at least one explicit barrier to remove.");
-
- // Remove an explicit barrier, check first, then second.
- if (!StartBI->isImplicit()) {
- LLVM_DEBUG(dbgs() << "Remove start barrier "
- << *StartBI->getInstruction() << "\n");
- BarriersToBeDeleted.insert(StartBI->getInstruction());
- } else {
- LLVM_DEBUG(dbgs() << "Remove end barrier "
- << *EndBI->getInstruction() << "\n");
- BarriersToBeDeleted.insert(EndBI->getInstruction());
- }
- }
-
- if (BarriersToBeDeleted.empty())
- continue;
-
- Changed = true;
- for (Instruction *I : BarriersToBeDeleted) {
- ++NumBarriersEliminated;
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Redundant barrier eliminated.";
- };
-
- if (EnableVerboseRemarks)
- emitRemark<OptimizationRemark>(I, "OMP190", Remark);
- I->eraseFromParent();
- }
- }
- }
-
- return Changed;
- }
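
    [Editorial note] For intuition, the criterion the removed block-local pass
    used (and which the AAExecutionDomain-based elimination introduced later in
    this patch generalizes across blocks and functions) can be pictured with a
    hedged OpenMP example:

        shared[tid] = compute(tid);
        #pragma omp barrier   // required: publishes the writes to shared[]
        int t = 2 * tid;      // thread-local work only
        #pragma omp barrier   // removable: no shared access since the last barrier
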
-
void analysisGlobalization() {
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
@@ -1743,10 +1557,14 @@ private:
// function. Used for storing information of the async transfer, allowing to
// wait on it later.
auto &IRBuilder = OMPInfoCache.OMPBuilder;
- auto *F = RuntimeCall.getCaller();
- Instruction *FirstInst = &(F->getEntryBlock().front());
- AllocaInst *Handle = new AllocaInst(
- IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
+ Function *F = RuntimeCall.getCaller();
+ BasicBlock &Entry = F->getEntryBlock();
+ IRBuilder.Builder.SetInsertPoint(&Entry,
+ Entry.getFirstNonPHIOrDbgOrAlloca());
+ Value *Handle = IRBuilder.Builder.CreateAlloca(
+ IRBuilder.AsyncInfo, /*ArraySize=*/nullptr, "handle");
+ Handle =
+ IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr);
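
    [Editorial note] On targets whose allocas live in a non-generic address
    space, the handle must be cast to the pointer type the runtime expects
    (IRBuilder.AsyncInfoPtr), hence the CreateAddrSpaceCast. A hedged sketch of
    the emitted IR shape, using an AMDGPU-style alloca address space for
    illustration:

        // %handle = alloca %struct.__tgt_async_info, addrspace(5)
        // %handle.ascast = addrspacecast ptr addrspace(5) %handle to ptr
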
// Add "issue" runtime call declaration:
// declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
@@ -1995,7 +1813,7 @@ private:
bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
/// Cache to remember the unique kernel for a function.
- DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
+ DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
/// Find the unique kernel that will execute \p F, if any.
Kernel getUniqueKernelFor(Function &F);
@@ -2055,30 +1873,6 @@ private:
[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
}
- /// RAII struct to temporarily change an RTL function's linkage to external.
- /// This prevents it from being mistakenly removed by other optimizations.
- struct ExternalizationRAII {
- ExternalizationRAII(OMPInformationCache &OMPInfoCache,
- RuntimeFunction RFKind)
- : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) {
- if (!Declaration)
- return;
-
- LinkageType = Declaration->getLinkage();
- Declaration->setLinkage(GlobalValue::ExternalLinkage);
- }
-
- ~ExternalizationRAII() {
- if (!Declaration)
- return;
-
- Declaration->setLinkage(LinkageType);
- }
-
- Function *Declaration;
- GlobalValue::LinkageTypes LinkageType;
- };
-
/// The underlying module.
Module &M;
@@ -2103,21 +1897,6 @@ private:
if (SCC.empty())
return false;
- // Temporarily make these function have external linkage so the Attributor
- // doesn't remove them when we try to look them up later.
- ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel);
- ExternalizationRAII EndParallel(OMPInfoCache,
- OMPRTL___kmpc_kernel_end_parallel);
- ExternalizationRAII BarrierSPMD(OMPInfoCache,
- OMPRTL___kmpc_barrier_simple_spmd);
- ExternalizationRAII BarrierGeneric(OMPInfoCache,
- OMPRTL___kmpc_barrier_simple_generic);
- ExternalizationRAII ThreadId(OMPInfoCache,
- OMPRTL___kmpc_get_hardware_thread_id_in_block);
- ExternalizationRAII NumThreads(
- OMPInfoCache, OMPRTL___kmpc_get_hardware_num_threads_in_block);
- ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size);
-
registerAAs(IsModulePass);
ChangeStatus Changed = A.run();
@@ -2131,17 +1910,22 @@ private:
void registerFoldRuntimeCall(RuntimeFunction RF);
/// Populate the Attributor with abstract attribute opportunities in the
- /// function.
+ /// functions.
void registerAAs(bool IsModulePass);
+
+public:
+ /// Callback to register AAs for live functions, including internal functions
+ /// marked live during the traversal.
+ static void registerAAsForFunction(Attributor &A, const Function &F);
};
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
- if (!OMPInfoCache.ModuleSlice.count(&F))
+ if (!OMPInfoCache.ModuleSlice.empty() && !OMPInfoCache.ModuleSlice.count(&F))
return nullptr;
// Use a scope to keep the lifetime of the CachedKernel short.
{
- Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
+ std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
if (CachedKernel)
return *CachedKernel;
@@ -2327,16 +2111,16 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
/// Return the value with which \p I can be replaced for specific \p ICV.
- virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
- const Instruction *I,
- Attributor &A) const {
- return None;
+ virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,
+ const Instruction *I,
+ Attributor &A) const {
+ return std::nullopt;
}
/// Return an assumed unique ICV value if a single candidate is found. If
- /// there cannot be one, return a nullptr. If it is not clear yet, return the
- /// Optional::NoneType.
- virtual Optional<Value *>
+ /// there cannot be one, return a nullptr. If it is not clear yet, return
+ /// std::nullopt.
+ virtual std::optional<Value *>
getUniqueReplacementValue(InternalControlVar ICV) const = 0;
// Currently only nthreads is being tracked.
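
    [Editorial note] A minimal sketch of the tri-state encoding this interface
    describes (std::nullopt = not known yet, nullptr = provably no unique
    value, non-null = the unique value); the merge helper is illustrative and
    not part of the patch:

        static std::optional<Value *> mergeICVValue(std::optional<Value *> A,
                                                    std::optional<Value *> B) {
          if (!A) return B;   // A still unknown: adopt whatever B says.
          if (!B) return A;   // B still unknown: keep A.
          if (*A == *B)       // Agreement (including nullptr == nullptr).
            return A;
          return std::optional<Value *>(nullptr); // Conflict: no unique value.
        }
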
@@ -2402,7 +2186,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
};
auto CallCheck = [&](Instruction &I) {
- Optional<Value *> ReplVal = getValueForCall(A, I, ICV);
+ std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
HasChanged = ChangeStatus::CHANGED;
@@ -2429,13 +2213,13 @@ struct AAICVTrackerFunction : public AAICVTracker {
/// Helper to check if \p I is a call and get the value for it if it is
/// unique.
- Optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
- InternalControlVar &ICV) const {
+ std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
+ InternalControlVar &ICV) const {
const auto *CB = dyn_cast<CallBase>(&I);
if (!CB || CB->hasFnAttr("no_openmp") ||
CB->hasFnAttr("no_openmp_routines"))
- return None;
+ return std::nullopt;
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
@@ -2446,7 +2230,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction == nullptr)
return nullptr;
if (CalledFunction == GetterRFI.Declaration)
- return None;
+ return std::nullopt;
if (CalledFunction == SetterRFI.Declaration) {
if (ICVReplacementValuesMap[ICV].count(&I))
return ICVReplacementValuesMap[ICV].lookup(&I);
@@ -2462,7 +2246,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
*this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
if (ICVTrackingAA.isAssumedTracked()) {
- Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
+ std::optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I),
OMPInfoCache)))
return URV;
@@ -2472,16 +2256,16 @@ struct AAICVTrackerFunction : public AAICVTracker {
return nullptr;
}
- // We don't check unique value for a function, so return None.
- Optional<Value *>
+ // We don't check unique value for a function, so return std::nullopt.
+ std::optional<Value *>
getUniqueReplacementValue(InternalControlVar ICV) const override {
- return None;
+ return std::nullopt;
}
/// Return the value with which \p I can be replaced for specific \p ICV.
- Optional<Value *> getReplacementValue(InternalControlVar ICV,
- const Instruction *I,
- Attributor &A) const override {
+ std::optional<Value *> getReplacementValue(InternalControlVar ICV,
+ const Instruction *I,
+ Attributor &A) const override {
const auto &ValuesMap = ICVReplacementValuesMap[ICV];
if (ValuesMap.count(I))
return ValuesMap.lookup(I);
@@ -2490,7 +2274,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
SmallPtrSet<const Instruction *, 16> Visited;
Worklist.push_back(I);
- Optional<Value *> ReplVal;
+ std::optional<Value *> ReplVal;
while (!Worklist.empty()) {
const Instruction *CurrInst = Worklist.pop_back_val();
@@ -2503,7 +2287,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
// ICV.
while ((CurrInst = CurrInst->getPrevNode())) {
if (ValuesMap.count(CurrInst)) {
- Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
+ std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
// Unknown value, track new.
if (!ReplVal) {
ReplVal = NewReplVal;
@@ -2518,7 +2302,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
break;
}
- Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
+ std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
if (!NewReplVal)
continue;
@@ -2566,12 +2350,12 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
}
// Map of ICV to their values at specific program point.
- EnumeratedArray<Optional<Value *>, InternalControlVar,
+ EnumeratedArray<std::optional<Value *>, InternalControlVar,
InternalControlVar::ICV___last>
ICVReplacementValuesMap;
/// Return the value with which \p I can be replaced for specific \p ICV.
- Optional<Value *>
+ std::optional<Value *>
getUniqueReplacementValue(InternalControlVar ICV) const override {
return ICVReplacementValuesMap[ICV];
}
@@ -2585,11 +2369,11 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
return indicatePessimisticFixpoint();
for (InternalControlVar ICV : TrackableICVs) {
- Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
- Optional<Value *> UniqueICVValue;
+ std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
+ std::optional<Value *> UniqueICVValue;
auto CheckReturnInst = [&](Instruction &I) {
- Optional<Value *> NewReplVal =
+ std::optional<Value *> NewReplVal =
ICVTrackingAA.getReplacementValue(ICV, &I, A);
// If we found a second ICV value there is no unique returned value.
@@ -2660,7 +2444,7 @@ struct AAICVTrackerCallSite : AAICVTracker {
void trackStatistics() const override {}
InternalControlVar AssociatedICV;
- Optional<Value *> ReplVal;
+ std::optional<Value *> ReplVal;
ChangeStatus updateImpl(Attributor &A) override {
const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
@@ -2670,7 +2454,7 @@ struct AAICVTrackerCallSite : AAICVTracker {
if (!ICVTrackingAA.isAssumedTracked())
return indicatePessimisticFixpoint();
- Optional<Value *> NewReplVal =
+ std::optional<Value *> NewReplVal =
ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
if (ReplVal == NewReplVal)
@@ -2682,7 +2466,7 @@ struct AAICVTrackerCallSite : AAICVTracker {
// Return the value with which associated value can be replaced for specific
// \p ICV.
- Optional<Value *>
+ std::optional<Value *>
getUniqueReplacementValue(InternalControlVar ICV) const override {
return ReplVal;
}
@@ -2706,13 +2490,13 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
}
// Map of ICV to their values at specific program point.
- EnumeratedArray<Optional<Value *>, InternalControlVar,
+ EnumeratedArray<std::optional<Value *>, InternalControlVar,
InternalControlVar::ICV___last>
ICVReplacementValuesMap;
/// Return the value with which associated value can be replaced for specific
/// \p ICV.
- Optional<Value *>
+ std::optional<Value *>
getUniqueReplacementValue(InternalControlVar ICV) const override {
return ICVReplacementValuesMap[ICV];
}
@@ -2728,8 +2512,8 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
return indicatePessimisticFixpoint();
for (InternalControlVar ICV : TrackableICVs) {
- Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
- Optional<Value *> NewReplVal =
+ std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
+ std::optional<Value *> NewReplVal =
ICVTrackingAA.getUniqueReplacementValue(ICV);
if (ReplVal == NewReplVal)
@@ -2746,77 +2530,216 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
: AAExecutionDomain(IRP, A) {}
+ ~AAExecutionDomainFunction() {
+ delete RPOT;
+ }
+
+ void initialize(Attributor &A) override {
+ if (getAnchorScope()->isDeclaration()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+ RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope());
+ }
+
const std::string getAsStr() const override {
- return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
- "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
+ unsigned TotalBlocks = 0, InitialThreadBlocks = 0;
+ for (auto &It : BEDMap) {
+ TotalBlocks++;
+ InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
+ }
+ return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
+ std::to_string(TotalBlocks) + " executed by initial thread only";
}
/// See AbstractAttribute::trackStatistics().
void trackStatistics() const override {}
- void initialize(Attributor &A) override {
- Function *F = getAnchorScope();
- for (const auto &BB : *F)
- SingleThreadedBBs.insert(&BB);
- NumBBs = SingleThreadedBBs.size();
- }
-
ChangeStatus manifest(Attributor &A) override {
LLVM_DEBUG({
- for (const BasicBlock *BB : SingleThreadedBBs)
+ for (const BasicBlock &BB : *getAnchorScope()) {
+ if (!isExecutedByInitialThreadOnly(BB))
+ continue;
dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
- << BB->getName() << " is executed by a single thread.\n";
+ << BB.getName() << " is executed by a single thread.\n";
+ }
});
- return ChangeStatus::UNCHANGED;
- }
- ChangeStatus updateImpl(Attributor &A) override;
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (DisableOpenMPOptBarrierElimination)
+ return Changed;
- /// Check if an instruction is executed by a single thread.
- bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
- return isExecutedByInitialThreadOnly(*I.getParent());
+ SmallPtrSet<CallBase *, 16> DeletedBarriers;
+ auto HandleAlignedBarrier = [&](CallBase *CB) {
+ const ExecutionDomainTy &ED = CEDMap[CB];
+ if (!ED.IsReachedFromAlignedBarrierOnly ||
+ ED.EncounteredNonLocalSideEffect)
+ return;
+
+ // We can remove this barrier, if it is one, or all aligned barriers
+ // reaching the kernel end. In the latter case we can transitively work
+ // our way back until we find a barrier that guards a side-effect if we
+ // are dealing with the kernel end here.
+ if (CB) {
+ DeletedBarriers.insert(CB);
+ A.deleteAfterManifest(*CB);
+ ++NumBarriersEliminated;
+ Changed = ChangeStatus::CHANGED;
+ } else if (!ED.AlignedBarriers.empty()) {
+ NumBarriersEliminated += ED.AlignedBarriers.size();
+ Changed = ChangeStatus::CHANGED;
+ SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),
+ ED.AlignedBarriers.end());
+ SmallSetVector<CallBase *, 16> Visited;
+ while (!Worklist.empty()) {
+ CallBase *LastCB = Worklist.pop_back_val();
+ if (!Visited.insert(LastCB))
+ continue;
+ if (!DeletedBarriers.count(LastCB)) {
+ A.deleteAfterManifest(*LastCB);
+ continue;
+ }
+ // The final aligned barrier (LastCB) reaching the kernel end was
+ // removed already. This means we can go one step further and remove
+ // the barriers encountered last before it (LastCB).
+ const ExecutionDomainTy &LastED = CEDMap[LastCB];
+ Worklist.append(LastED.AlignedBarriers.begin(),
+ LastED.AlignedBarriers.end());
+ }
+ }
+
+ // If we actually eliminated a barrier we need to eliminate the associated
+ // llvm.assumes as well to avoid creating UB.
+ if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
+ for (auto *AssumeCB : ED.EncounteredAssumes)
+ A.deleteAfterManifest(*AssumeCB);
+ };
+
+ for (auto *CB : AlignedBarriers)
+ HandleAlignedBarrier(CB);
+
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ // Handle the "kernel end barrier" for kernels too.
+ if (OMPInfoCache.Kernels.count(getAnchorScope()))
+ HandleAlignedBarrier(nullptr);
+
+ return Changed;
}
+ /// Merge barrier and assumption information from \p PredED into the successor
+ /// \p ED.
+ void
+ mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
+ const ExecutionDomainTy &PredED);
+
+ /// Merge all information from \p PredED into the successor \p ED. If
+ /// \p InitialEdgeOnly is set, only the initial edge will enter the block
+ /// represented by \p ED from this predecessor.
+ void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
+ const ExecutionDomainTy &PredED,
+ bool InitialEdgeOnly = false);
+
+ /// Accumulate information for the entry block in \p EntryBBED.
+ void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED);
+
+ /// See AbstractAttribute::updateImpl.
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// Query interface, see AAExecutionDomain
+ ///{
bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
- return isValidState() && SingleThreadedBBs.contains(&BB);
+ if (!isValidState())
+ return false;
+ return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
}
- /// Set of basic blocks that are executed by a single thread.
- SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs;
+ bool isExecutedInAlignedRegion(Attributor &A,
+ const Instruction &I) const override {
+ if (!isValidState() || isa<CallBase>(I))
+ return false;
- /// Total number of basic blocks in this function.
- long unsigned NumBBs = 0;
-};
+ const Instruction *CurI;
-ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
- Function *F = getAnchorScope();
- ReversePostOrderTraversal<Function *> RPOT(F);
- auto NumSingleThreadedBBs = SingleThreadedBBs.size();
+ // Check forward until a call or the block end is reached.
+ CurI = &I;
+ do {
+ auto *CB = dyn_cast<CallBase>(CurI);
+ if (!CB)
+ continue;
+ const auto &It = CEDMap.find(CB);
+ if (It == CEDMap.end())
+ continue;
+ if (!It->getSecond().IsReachedFromAlignedBarrierOnly)
+ return false;
+ } while ((CurI = CurI->getNextNonDebugInstruction()));
- bool AllCallSitesKnown;
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
- DepClassTy::REQUIRED);
- return ACS.isDirectCall() &&
- ExecutionDomainAA.isExecutedByInitialThreadOnly(
- *ACS.getInstruction());
- };
+ if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly)
+ return false;
- if (!A.checkForAllCallSites(PredForCallSite, *this,
- /* RequiresAllCallSites */ true,
- AllCallSitesKnown))
- SingleThreadedBBs.remove(&F->getEntryBlock());
+ // Check backward until a call or the block beginning is reached.
+ CurI = &I;
+ do {
+ auto *CB = dyn_cast<CallBase>(CurI);
+ if (!CB)
+ continue;
+ const auto &It = CEDMap.find(CB);
+ if (It == CEDMap.end())
+ continue;
+ if (!AA::isNoSyncInst(A, *CB, *this)) {
+ if (It->getSecond().IsReachedFromAlignedBarrierOnly)
+ break;
+ return false;
+ }
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ return false;
+ const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+ *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
+ if (!EDAA.getState().isValidState())
+ return false;
+ if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly)
+ return false;
+ break;
+ } while ((CurI = CurI->getPrevNonDebugInstruction()));
+
+ if (!CurI &&
+ !llvm::all_of(
+ predecessors(I.getParent()), [&](const BasicBlock *PredBB) {
+ return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
+ })) {
+ return false;
+ }
+
+ // Neither traversal found anything but aligned barriers.
+ return true;
+ }
+
+ ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
+ assert(isValidState() &&
+ "No request should be made against an invalid state!");
+ return BEDMap.lookup(&BB);
+ }
+ ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override {
+ assert(isValidState() &&
+ "No request should be made against an invalid state!");
+ return CEDMap.lookup(&CB);
+ }
+ ExecutionDomainTy getFunctionExecutionDomain() const override {
+ assert(isValidState() &&
+ "No request should be made against an invalid state!");
+ return BEDMap.lookup(nullptr);
+ }
+ ///}
// Check if the edge into the successor block contains a condition that only
// lets the main thread execute it.
- auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
+ static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
+ BasicBlock &SuccessorBB) {
if (!Edge || !Edge->isConditional())
return false;
- if (Edge->getSuccessor(0) != SuccessorBB)
+ if (Edge->getSuccessor(0) != &SuccessorBB)
return false;
auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
@@ -2830,6 +2753,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
// Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
if (C->isAllOnesValue()) {
auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
if (!CB)
return false;
@@ -2853,30 +2778,335 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
return false;
};
- // Merge all the predecessor states into the current basic block. A basic
- // block is executed by a single thread if all of its predecessors are.
- auto MergePredecessorStates = [&](BasicBlock *BB) {
- if (pred_empty(BB))
- return SingleThreadedBBs.contains(BB);
+ /// Mappings containing execution domain information per basic block and
+ /// per call site.
+ DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
+ DenseMap<const CallBase *, ExecutionDomainTy> CEDMap;
+ SmallSetVector<CallBase *, 16> AlignedBarriers;
+
+ ReversePostOrderTraversal<Function *> *RPOT = nullptr;
+};
+
+void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
+ Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
+ for (auto *EA : PredED.EncounteredAssumes)
+ ED.addAssumeInst(A, *EA);
+
+ for (auto *AB : PredED.AlignedBarriers)
+ ED.addAlignedBarrier(A, *AB);
+}
- bool IsInitialThread = true;
- for (BasicBlock *PredBB : predecessors(BB)) {
- if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
- BB))
- IsInitialThread &= SingleThreadedBBs.contains(PredBB);
+void AAExecutionDomainFunction::mergeInPredecessor(
+ Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
+ bool InitialEdgeOnly) {
+ ED.IsExecutedByInitialThreadOnly =
+ InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
+ ED.IsExecutedByInitialThreadOnly);
+
+ ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly &&
+ PredED.IsReachedFromAlignedBarrierOnly;
+ ED.EncounteredNonLocalSideEffect =
+ ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect;
+ if (ED.IsReachedFromAlignedBarrierOnly)
+ mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
+ else
+ ED.clearAssumeInstAndAlignedBarriers();
+}
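
    [Editorial note] Summarizing the meet implemented above (my reading of the
    code):

        // IsExecutedByInitialThreadOnly  : AND across predecessors, forced to
        //                                  true when the incoming edge itself
        //                                  is initial-thread-only.
        // IsReachedFromAlignedBarrierOnly: AND across predecessors.
        // EncounteredNonLocalSideEffect  : OR across predecessors.
        // Barriers/assumes merge only while the aligned-barrier property
        // holds; otherwise they are cleared.
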
+
+void AAExecutionDomainFunction::handleEntryBB(Attributor &A,
+ ExecutionDomainTy &EntryBBED) {
+ SmallVector<ExecutionDomainTy> PredExecDomains;
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+ *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
+ DepClassTy::OPTIONAL);
+ if (!EDAA.getState().isValidState())
+ return false;
+ PredExecDomains.emplace_back(
+ EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
+ return true;
+ };
+
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(PredForCallSite, *this,
+ /* RequiresAllCallSites */ true,
+ AllCallSitesKnown)) {
+ for (const auto &PredED : PredExecDomains)
+ mergeInPredecessor(A, EntryBBED, PredED);
+
+ } else {
+ // We could not find all predecessors, so this is either a kernel or a
+ // function with external linkage (or with some other weird uses).
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ if (OMPInfoCache.Kernels.count(getAnchorScope())) {
+ EntryBBED.IsExecutedByInitialThreadOnly = false;
+ EntryBBED.IsReachedFromAlignedBarrierOnly = true;
+ EntryBBED.EncounteredNonLocalSideEffect = false;
+ } else {
+ EntryBBED.IsExecutedByInitialThreadOnly = false;
+ EntryBBED.IsReachedFromAlignedBarrierOnly = false;
+ EntryBBED.EncounteredNonLocalSideEffect = true;
}
+ }
+
+ auto &FnED = BEDMap[nullptr];
+ FnED.IsReachingAlignedBarrierOnly &=
+ EntryBBED.IsReachedFromAlignedBarrierOnly;
+}
- return IsInitialThread;
+ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
+
+ bool Changed = false;
+
+ // Helper to deal with an aligned barrier encountered during the forward
+ // traversal. \p CB is the aligned barrier, \p ED is the execution domain when
+ // it was encountered.
+ auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) {
+ if (CB)
+ Changed |= AlignedBarriers.insert(CB);
+ // First, update the barrier ED kept in the separate CEDMap.
+ auto &CallED = CEDMap[CB];
+ mergeInPredecessor(A, CallED, ED);
+ // Next adjust the ED we use for the traversal.
+ ED.EncounteredNonLocalSideEffect = false;
+ ED.IsReachedFromAlignedBarrierOnly = true;
+ // Aligned barrier collection has to come last.
+ ED.clearAssumeInstAndAlignedBarriers();
+ if (CB)
+ ED.addAlignedBarrier(A, *CB);
+ };
+
+ auto &LivenessAA =
+ A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);
+
+ // Set \p R to \p V and report true if that changed \p R.
+ auto SetAndRecord = [&](bool &R, bool V) {
+ bool Eq = (R == V);
+ R = V;
+ return !Eq;
};
- for (auto *BB : RPOT) {
- if (!MergePredecessorStates(BB))
- SingleThreadedBBs.remove(BB);
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ Function *F = getAnchorScope();
+ BasicBlock &EntryBB = F->getEntryBlock();
+ bool IsKernel = OMPInfoCache.Kernels.count(F);
+
+ SmallVector<Instruction *> SyncInstWorklist;
+ for (auto &RIt : *RPOT) {
+ BasicBlock &BB = *RIt;
+
+ bool IsEntryBB = &BB == &EntryBB;
+ // TODO: We use local reasoning since we don't have a divergence analysis
+ // running as well. We could basically allow uniform branches here.
+ bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
+ ExecutionDomainTy ED;
+ // Propagate "incoming edges" into information about this block.
+ if (IsEntryBB) {
+ handleEntryBB(A, ED);
+ } else {
+ // For live non-entry blocks we only propagate
+ // information via live edges.
+ if (LivenessAA.isAssumedDead(&BB))
+ continue;
+
+ for (auto *PredBB : predecessors(&BB)) {
+ if (LivenessAA.isEdgeDead(PredBB, &BB))
+ continue;
+ bool InitialEdgeOnly = isInitialThreadOnlyEdge(
+ A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
+ mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
+ }
+ }
+
+ // Now we traverse the block, accumulate effects in ED and attach
+ // information to calls.
+ for (Instruction &I : BB) {
+ bool UsedAssumedInformation;
+ if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation,
+ /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,
+ /* CheckForDeadStore */ true))
+ continue;
+
+ // Assumes and "assume-like" intrinsics (dbg, lifetime, ...) are handled
+ // first; the former are collected, the latter are ignored.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) {
+ ED.addAssumeInst(A, *AI);
+ continue;
+ }
+ // TODO: Should we also collect and delete lifetime markers?
+ if (II->isAssumeLikeIntrinsic())
+ continue;
+ }
+
+ auto *CB = dyn_cast<CallBase>(&I);
+ bool IsNoSync = AA::isNoSyncInst(A, I, *this);
+ bool IsAlignedBarrier =
+ !IsNoSync && CB &&
+ AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);
+
+ AlignedBarrierLastInBlock &= IsNoSync;
+
+ // Next we check for calls. Aligned barriers are handled
+ // explicitly, everything else is kept for the backward traversal and will
+ // also affect our state.
+ if (CB) {
+ if (IsAlignedBarrier) {
+ HandleAlignedBarrier(CB, ED);
+ AlignedBarrierLastInBlock = true;
+ continue;
+ }
+
+ // Check the pointer(s) of a memory intrinsic explicitly.
+ if (isa<MemIntrinsic>(&I)) {
+ if (!ED.EncounteredNonLocalSideEffect &&
+ AA::isPotentiallyAffectedByBarrier(A, I, *this))
+ ED.EncounteredNonLocalSideEffect = true;
+ if (!IsNoSync) {
+ ED.IsReachedFromAlignedBarrierOnly = false;
+ SyncInstWorklist.push_back(&I);
+ }
+ continue;
+ }
+
+ // Record how we entered the call, then accumulate the effect of the
+ // call in ED for potential use by the callee.
+ auto &CallED = CEDMap[CB];
+ mergeInPredecessor(A, CallED, ED);
+
+ // If we have a sync-definition we can check if it starts/ends in an
+ // aligned barrier. If we are unsure we assume any sync breaks
+ // alignment.
+ Function *Callee = CB->getCalledFunction();
+ if (!IsNoSync && Callee && !Callee->isDeclaration()) {
+ const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+ *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
+ if (EDAA.getState().isValidState()) {
+ const auto &CalleeED = EDAA.getFunctionExecutionDomain();
+ ED.IsReachedFromAlignedBarrierOnly =
+ CalleeED.IsReachedFromAlignedBarrierOnly;
+ AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
+ if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
+ ED.EncounteredNonLocalSideEffect |=
+ CalleeED.EncounteredNonLocalSideEffect;
+ else
+ ED.EncounteredNonLocalSideEffect =
+ CalleeED.EncounteredNonLocalSideEffect;
+ if (!CalleeED.IsReachingAlignedBarrierOnly)
+ SyncInstWorklist.push_back(&I);
+ if (CalleeED.IsReachedFromAlignedBarrierOnly)
+ mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
+ continue;
+ }
+ }
+ ED.IsReachedFromAlignedBarrierOnly =
+ IsNoSync && ED.IsReachedFromAlignedBarrierOnly;
+ AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
+ ED.EncounteredNonLocalSideEffect |= !CB->doesNotAccessMemory();
+ if (!IsNoSync)
+ SyncInstWorklist.push_back(&I);
+ }
+
+ if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
+ continue;
+
+ // If we have a callee we try to use fine-grained information to
+ // determine local side-effects.
+ if (CB) {
+ const auto &MemAA = A.getAAFor<AAMemoryLocation>(
+ *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL);
+
+ auto AccessPred = [&](const Instruction *I, const Value *Ptr,
+ AAMemoryLocation::AccessKind,
+ AAMemoryLocation::MemoryLocationsKind) {
+ return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I);
+ };
+ if (MemAA.getState().isValidState() &&
+ MemAA.checkForAllAccessesToMemoryKind(
+ AccessPred, AAMemoryLocation::ALL_LOCATIONS))
+ continue;
+ }
+
+ if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I))
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+ continue;
+
+ if (!ED.EncounteredNonLocalSideEffect &&
+ AA::isPotentiallyAffectedByBarrier(A, I, *this))
+ ED.EncounteredNonLocalSideEffect = true;
+ }
+
+ if (!isa<UnreachableInst>(BB.getTerminator()) &&
+ !BB.getTerminator()->getNumSuccessors()) {
+
+ auto &FnED = BEDMap[nullptr];
+ mergeInPredecessor(A, FnED, ED);
+
+ if (IsKernel)
+ HandleAlignedBarrier(nullptr, ED);
+ }
+
+ ExecutionDomainTy &StoredED = BEDMap[&BB];
+ ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly;
+
+ // Check if we computed anything different as part of the forward
+ // traversal. We do not take assumptions and aligned barriers into account
+ // as they do not influence the state we iterate. Backward traversal values
+ // are handled later on.
+ if (ED.IsExecutedByInitialThreadOnly !=
+ StoredED.IsExecutedByInitialThreadOnly ||
+ ED.IsReachedFromAlignedBarrierOnly !=
+ StoredED.IsReachedFromAlignedBarrierOnly ||
+ ED.EncounteredNonLocalSideEffect !=
+ StoredED.EncounteredNonLocalSideEffect)
+ Changed = true;
+
+ // Update the state with the new value.
+ StoredED = std::move(ED);
}
- return (NumSingleThreadedBBs == SingleThreadedBBs.size())
- ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
+ // Propagate (non-aligned) sync instruction effects backwards until the
+ // entry or an aligned barrier is hit.
+ SmallSetVector<BasicBlock *, 16> Visited;
+ while (!SyncInstWorklist.empty()) {
+ Instruction *SyncInst = SyncInstWorklist.pop_back_val();
+ Instruction *CurInst = SyncInst;
+ bool HitAlignedBarrier = false;
+ while ((CurInst = CurInst->getPrevNode())) {
+ auto *CB = dyn_cast<CallBase>(CurInst);
+ if (!CB)
+ continue;
+ auto &CallED = CEDMap[CB];
+ if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false))
+ Changed = true;
+ HitAlignedBarrier = AlignedBarriers.count(CB);
+ if (HitAlignedBarrier)
+ break;
+ }
+ if (HitAlignedBarrier)
+ continue;
+ BasicBlock *SyncBB = SyncInst->getParent();
+ for (auto *PredBB : predecessors(SyncBB)) {
+ if (LivenessAA.isEdgeDead(PredBB, SyncBB))
+ continue;
+ if (!Visited.insert(PredBB))
+ continue;
+ SyncInstWorklist.push_back(PredBB->getTerminator());
+ auto &PredED = BEDMap[PredBB];
+ if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false))
+ Changed = true;
+ }
+ if (SyncBB != &EntryBB)
+ continue;
+ auto &FnED = BEDMap[nullptr];
+ if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false))
+ Changed = true;
+ }
+
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
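
    [Editorial note] A worked micro-example of the combined traversals and of
    manifest() above (hedged, my reading of the code):

        // kernel entry                      // acts as an implicit aligned barrier
        // __kmpc_barrier_simple_spmd(...);  // B1: deleted, nothing happened yet
        // shared[tid] = v;                  // non-local side effect
        // __kmpc_barrier_simple_spmd(...);  // B2: not deleted directly (it guards
        //                                   //     the store), but subsumed by the
        // local += 1;                       //     implicit kernel-end barrier since
        // return;                           //     only local work follows; B2 is
        //                                   //     deleted via the kernel-end path.
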
/// Try to replace memory allocation calls called by a single thread with a
@@ -2955,12 +3185,18 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+ if (!RFI.Declaration)
+ return;
Attributor::SimplifictionCallbackTy SCB =
[](const IRPosition &, const AbstractAttribute *,
- bool &) -> Optional<Value *> { return nullptr; };
+ bool &) -> std::optional<Value *> { return nullptr; };
+
+ Function *F = getAnchorScope();
for (User *U : RFI.Declaration->users())
if (CallBase *CB = dyn_cast<CallBase>(U)) {
+ if (CB->getFunction() != F)
+ continue;
MallocCalls.insert(CB);
A.registerSimplificationCallback(IRPosition::callsite_returned(*CB),
SCB);
@@ -3057,20 +3293,33 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
}
ChangeStatus updateImpl(Attributor &A) override {
+ if (MallocCalls.empty())
+ return indicatePessimisticFixpoint();
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+ if (!RFI.Declaration)
+ return ChangeStatus::UNCHANGED;
+
Function *F = getAnchorScope();
auto NumMallocCalls = MallocCalls.size();
// Only consider malloc calls executed by a single thread with a constant.
for (User *U : RFI.Declaration->users()) {
- const auto &ED = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*F), DepClassTy::REQUIRED);
- if (CallBase *CB = dyn_cast<CallBase>(U))
- if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
- !ED.isExecutedByInitialThreadOnly(*CB))
+ if (CallBase *CB = dyn_cast<CallBase>(U)) {
+ if (CB->getCaller() != F)
+ continue;
+ if (!MallocCalls.count(CB))
+ continue;
+ if (!isa<ConstantInt>(CB->getArgOperand(0))) {
+ MallocCalls.remove(CB);
+ continue;
+ }
+ const auto &ED = A.getAAFor<AAExecutionDomain>(
+ *this, IRPosition::function(*F), DepClassTy::REQUIRED);
+ if (!ED.isExecutedByInitialThreadOnly(*CB))
MallocCalls.remove(CB);
+ }
}
findPotentialRemovedFreeCalls(A);
@@ -3115,6 +3364,10 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
", #Reaching Kernels: " +
(ReachingKernelEntries.isValidState()
? std::to_string(ReachingKernelEntries.size())
+ : "<invalid>") +
+ ", #ParLevels: " +
+ (ParallelLevels.isValidState()
+ ? std::to_string(ParallelLevels.size())
: "<invalid>");
}
@@ -3202,7 +3455,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> Optional<Value *> {
+ bool &UsedAssumedInformation) -> std::optional<Value *> {
// IRP represents the "use generic state machine" argument of an
// __kmpc_target_init call. We will answer this one with the internal
// state. As long as we are not in an invalid state, we will create a
@@ -3223,7 +3476,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Attributor::SimplifictionCallbackTy ModeSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> Optional<Value *> {
+ bool &UsedAssumedInformation) -> std::optional<Value *> {
// IRP represents the "SPMDCompatibilityTracker" argument of an
// __kmpc_target_init or
// __kmpc_target_deinit call. We will answer this one with the internal
@@ -3244,32 +3497,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
return Val;
};
- Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB =
- [&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> Optional<Value *> {
- // IRP represents the "RequiresFullRuntime" argument of an
- // __kmpc_target_init or __kmpc_target_deinit call. We will answer this
- // one with the internal state of the SPMDCompatibilityTracker, so if
- // generic then true, if SPMD then false.
- if (!SPMDCompatibilityTracker.isValidState())
- return nullptr;
- if (!SPMDCompatibilityTracker.isAtFixpoint()) {
- if (AA)
- A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
- UsedAssumedInformation = true;
- } else {
- UsedAssumedInformation = false;
- }
- auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
- !SPMDCompatibilityTracker.isAssumed());
- return Val;
- };
-
constexpr const int InitModeArgNo = 1;
constexpr const int DeinitModeArgNo = 1;
constexpr const int InitUseStateMachineArgNo = 2;
- constexpr const int InitRequiresFullRuntimeArgNo = 3;
- constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
StateMachineSimplifyCB);
@@ -3279,14 +3509,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
ModeSimplifyCB);
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB,
- InitRequiresFullRuntimeArgNo),
- IsGenericModeSimplifyCB);
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelDeinitCB,
- DeinitRequiresFullRuntimeArgNo),
- IsGenericModeSimplifyCB);
// Check if we know we are in SPMD-mode already.
ConstantInt *ModeArg =
@@ -3296,6 +3518,84 @@ struct AAKernelInfoFunction : AAKernelInfo {
// This is a generic region but SPMDization is disabled so stop tracking.
else if (DisableOpenMPOptSPMDization)
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+
+ // Register virtual uses of functions we might need to preserve.
+ auto RegisterVirtualUse = [&](RuntimeFunction RFKind,
+ Attributor::VirtualUseCallbackTy &CB) {
+ if (!OMPInfoCache.RFIs[RFKind].Declaration)
+ return;
+ A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
+ };
+
+ // Add a dependence to ensure updates if the state changes.
+ auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
+ const AbstractAttribute *QueryingAA) {
+ if (QueryingAA) {
+ A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
+ }
+ return true;
+ };
+
+ Attributor::VirtualUseCallbackTy CustomStateMachineUseCB =
+ [&](Attributor &A, const AbstractAttribute *QueryingAA) {
+ // Whenever we create a custom state machine we will insert calls to
+ // __kmpc_get_hardware_num_threads_in_block,
+ // __kmpc_get_warp_size,
+ // __kmpc_barrier_simple_generic,
+ // __kmpc_kernel_parallel, and
+ // __kmpc_kernel_end_parallel.
+ // Not needed if we are on track for SPMDzation.
+ if (SPMDCompatibilityTracker.isValidState())
+ return AddDependence(A, this, QueryingAA);
+ // Not needed if we can't rewrite due to an invalid state.
+ if (!ReachedKnownParallelRegions.isValidState())
+ return AddDependence(A, this, QueryingAA);
+ return false;
+ };
+
+ // Not needed if we run before the device runtime has been merged in.
+ if (!KernelInitCB->getCalledFunction()->isDeclaration()) {
+ RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
+ CustomStateMachineUseCB);
+ RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
+ RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
+ CustomStateMachineUseCB);
+ RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
+ CustomStateMachineUseCB);
+ RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
+ CustomStateMachineUseCB);
+ }
+
+ // If we do not perform SPMDzation we do not need the virtual uses below.
+ if (SPMDCompatibilityTracker.isAtFixpoint())
+ return;
+
+ Attributor::VirtualUseCallbackTy HWThreadIdUseCB =
+ [&](Attributor &A, const AbstractAttribute *QueryingAA) {
+ // Whenever we perform SPMDzation we will insert
+ // __kmpc_get_hardware_thread_id_in_block calls.
+ if (!SPMDCompatibilityTracker.isValidState())
+ return AddDependence(A, this, QueryingAA);
+ return false;
+ };
+ RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
+ HWThreadIdUseCB);
+
+ Attributor::VirtualUseCallbackTy SPMDBarrierUseCB =
+ [&](Attributor &A, const AbstractAttribute *QueryingAA) {
+ // Whenever we perform SPMDzation with guarding we will insert
+ // __kmpc_simple_barrier_spmd calls. If SPMDzation failed, there is
+ // nothing to guard, or there are no parallel regions, we don't need
+ // the calls.
+ if (!SPMDCompatibilityTracker.isValidState())
+ return AddDependence(A, this, QueryingAA);
+ if (SPMDCompatibilityTracker.empty())
+ return AddDependence(A, this, QueryingAA);
+ if (!mayContainParallelRegion())
+ return AddDependence(A, this, QueryingAA);
+ return false;
+ };
+ RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
}
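
    [Editorial note] These virtual-use callbacks replace the removed
    ExternalizationRAII workaround: instead of temporarily forcing external
    linkage, the AA now tells the Attributor which runtime declarations it may
    still materialize calls to. A minimal sketch of the pattern (the predicate
    is hypothetical):

        Attributor::VirtualUseCallbackTy KeepAliveCB =
            [&](Attributor &A, const AbstractAttribute *QueryingAA) {
              if (mightStillEmitCallsTo(RF))               // hypothetical predicate
                return AddDependence(A, this, QueryingAA); // keep alive, re-query
              return false;                                // provably unneeded
            };
        RegisterVirtualUse(RF, KeepAliveCB);
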
/// Sanitize the string \p S such that it is a suitable global symbol name.
@@ -3318,77 +3618,29 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;
+ /// Insert the nested parallelism global variable.
+ Function *Kernel = getAnchorScope();
+ Module &M = *Kernel->getParent();
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
+ new GlobalVariable(M, Int8Ty, /* isConstant */ true,
+ GlobalValue::WeakAnyLinkage,
+ ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
+ Kernel->getName() + "_nested_parallelism");
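
    [Editorial note] Sketch of the resulting IR global for a kernel named `foo`
    without nested parallelism (assumed rendering):

        // @foo_nested_parallelism = weak constant i8 0
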
+
// If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine.
ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (!changeToSPMDMode(A, Changed))
- return buildCustomStateMachine(A);
+ if (!changeToSPMDMode(A, Changed)) {
+ if (!KernelInitCB->getCalledFunction()->isDeclaration())
+ return buildCustomStateMachine(A);
+ }
return Changed;
}
- bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
- if (!mayContainParallelRegion())
- return false;
-
+ void insertInstructionGuardsHelper(Attributor &A) {
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- if (!SPMDCompatibilityTracker.isAssumed()) {
- for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
- if (!NonCompatibleI)
- continue;
-
- // Skip diagnostics on calls to known OpenMP runtime functions for now.
- if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
- if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
- continue;
-
- auto Remark = [&](OptimizationRemarkAnalysis ORA) {
- ORA << "Value has potential side effects preventing SPMD-mode "
- "execution";
- if (isa<CallBase>(NonCompatibleI)) {
- ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
- "the called function to override";
- }
- return ORA << ".";
- };
- A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
- Remark);
-
- LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
- << *NonCompatibleI << "\n");
- }
-
- return false;
- }
-
- // Get the actual kernel, could be the caller of the anchor scope if we have
- // a debug wrapper.
- Function *Kernel = getAnchorScope();
- if (Kernel->hasLocalLinkage()) {
- assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");
- auto *CB = cast<CallBase>(Kernel->user_back());
- Kernel = CB->getCaller();
- }
- assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
-
- // Check if the kernel is already in SPMD mode, if so, return success.
- GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
- (Kernel->getName() + "_exec_mode").str());
- assert(ExecMode && "Kernel without exec mode?");
- assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
-
- // Set the global exec mode flag to indicate SPMD-Generic mode.
- assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
- "ExecMode is not an integer!");
- const int8_t ExecModeVal =
- cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
- if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
- return true;
-
- // We will now unconditionally modify the IR, indicate a change.
- Changed = ChangeStatus::CHANGED;
-
auto CreateGuardedRegion = [&](Instruction *RegionStartI,
Instruction *RegionEndI) {
LoopInfo *LI = nullptr;
@@ -3605,6 +3857,125 @@ struct AAKernelInfoFunction : AAKernelInfo {
for (auto &GR : GuardedRegions)
CreateGuardedRegion(GR.first, GR.second);
+ }
+
+ void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
+ // Only allow 1 thread per workgroup to continue executing the user code.
+ //
+ // InitCB = __kmpc_target_init(...)
+ // ThreadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
+ // if (ThreadIdInBlock != 0) return;
+ // UserCode:
+ // // user code
+ //
+ auto &Ctx = getAnchorValue().getContext();
+ Function *Kernel = getAssociatedFunction();
+ assert(Kernel && "Expected an associated function!");
+
+ // Create block for user code to branch to from initial block.
+ BasicBlock *InitBB = KernelInitCB->getParent();
+ BasicBlock *UserCodeBB = InitBB->splitBasicBlock(
+ KernelInitCB->getNextNode(), "main.thread.user_code");
+ BasicBlock *ReturnBB =
+ BasicBlock::Create(Ctx, "exit.threads", Kernel, UserCodeBB);
+
+ // Register blocks with attributor:
+ A.registerManifestAddedBasicBlock(*InitBB);
+ A.registerManifestAddedBasicBlock(*UserCodeBB);
+ A.registerManifestAddedBasicBlock(*ReturnBB);
+
+ // Debug location:
+ const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
+ ReturnInst::Create(Ctx, ReturnBB)->setDebugLoc(DLoc);
+ InitBB->getTerminator()->eraseFromParent();
+
+ // Prepare call to OMPRTL___kmpc_get_hardware_thread_id_in_block.
+ Module &M = *Kernel->getParent();
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ FunctionCallee ThreadIdInBlockFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
+
+ // Get thread ID in block.
+ CallInst *ThreadIdInBlock =
+ CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
+ OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
+ ThreadIdInBlock->setDebugLoc(DLoc);
+
+ // Eliminate all threads in the block with ID not equal to 0:
+ Instruction *IsMainThread =
+ ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
+ ConstantInt::get(ThreadIdInBlock->getType(), 0),
+ "thread.is_main", InitBB);
+ IsMainThread->setDebugLoc(DLoc);
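+ // Note the inverted sense: with ICMP_NE, "thread.is_main" is true for the
+ // non-main threads, so the true edge targets ReturnBB while thread 0 falls
+ // through to the user code.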
+ BranchInst::Create(ReturnBB, UserCodeBB, IsMainThread, InitBB);
+ }
+
+ bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ if (!SPMDCompatibilityTracker.isAssumed()) {
+ for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
+ if (!NonCompatibleI)
+ continue;
+
+ // Skip diagnostics on calls to known OpenMP runtime functions for now.
+ if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
+ if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
+ continue;
+
+ auto Remark = [&](OptimizationRemarkAnalysis ORA) {
+ ORA << "Value has potential side effects preventing SPMD-mode "
+ "execution";
+ if (isa<CallBase>(NonCompatibleI)) {
+ ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
+ "the called function to override";
+ }
+ return ORA << ".";
+ };
+ A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
+ Remark);
+
+ LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
+ << *NonCompatibleI << "\n");
+ }
+
+ return false;
+ }
+
+ // Get the actual kernel; it could be the caller of the anchor scope if we
+ // have a debug wrapper.
+ Function *Kernel = getAnchorScope();
+ if (Kernel->hasLocalLinkage()) {
+ assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");
+ auto *CB = cast<CallBase>(Kernel->user_back());
+ Kernel = CB->getCaller();
+ }
+ assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
+
+ // Check if the kernel is already in SPMD mode; if so, return success.
+ GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
+ (Kernel->getName() + "_exec_mode").str());
+ assert(ExecMode && "Kernel without exec mode?");
+ assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
+
+ // Set the global exec mode flag to indicate SPMD-Generic mode.
+ assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
+ "ExecMode is not an integer!");
+ const int8_t ExecModeVal =
+ cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+ if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
+ return true;
+
+ // We will now unconditionally modify the IR, indicate a change.
+ Changed = ChangeStatus::CHANGED;
+
+ // Do not use instruction guards when no parallel region is present inside
+ // the target region.
+ if (mayContainParallelRegion())
+ insertInstructionGuardsHelper(A);
+ else
+ forceSingleThreadPerWorkgroupHelper(A);
// Adjust the global exec mode flag that tells the runtime what mode this
// kernel is executed in.
@@ -3618,8 +3989,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
const int InitModeArgNo = 1;
const int DeinitModeArgNo = 1;
const int InitUseStateMachineArgNo = 2;
- const int InitRequiresFullRuntimeArgNo = 3;
- const int DeinitRequiresFullRuntimeArgNo = 2;
auto &Ctx = getAnchorValue().getContext();
A.changeUseAfterManifest(
@@ -3633,12 +4002,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
*ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
OMP_TGT_EXEC_MODE_SPMD));
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, false));
- A.changeUseAfterManifest(
- KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, false));
++NumOpenMPTargetRegionKernelsSPMD;
@@ -3982,23 +4345,21 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!I.mayWriteToMemory())
return true;
if (auto *SI = dyn_cast<StoreInst>(&I)) {
- SmallVector<const Value *> Objects;
- getUnderlyingObjects(SI->getPointerOperand(), Objects);
- if (llvm::all_of(Objects,
- [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
- return true;
- // Check for AAHeapToStack moved objects which must not be guarded.
+ const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
+ *this, IRPosition::value(*SI->getPointerOperand()),
+ DepClassTy::OPTIONAL);
auto &HS = A.getAAFor<AAHeapToStack>(
*this, IRPosition::function(*I.getFunction()),
DepClassTy::OPTIONAL);
- if (llvm::all_of(Objects, [&HS](const Value *Obj) {
- auto *CB = dyn_cast<CallBase>(Obj);
- if (!CB)
- return false;
- return HS.isAssumedHeapToStack(*CB);
- })) {
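+ // A store needs no guard if every underlying object is assumed to be
+ // thread local, or is an allocation that AAHeapToStack will privatize.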
+ if (UnderlyingObjsAA.forallUnderlyingObjects([&](Value &Obj) {
+ if (AA::isAssumedThreadLocalObject(A, Obj, *this))
+ return true;
+ // Check for AAHeapToStack moved objects which must not be
+ // guarded.
+ auto *CB = dyn_cast<CallBase>(&Obj);
+ return CB && HS.isAssumedHeapToStack(*CB);
+ }))
return true;
- }
}
// Insert instruction that needs guarding.
@@ -4020,28 +4381,30 @@ struct AAKernelInfoFunction : AAKernelInfo {
updateReachingKernelEntries(A, AllReachingKernelsKnown);
UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
- if (!ParallelLevels.isValidState())
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- else if (!ReachingKernelEntries.isValidState())
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
- else if (!SPMDCompatibilityTracker.empty()) {
- // Check if all reaching kernels agree on the mode as we can otherwise
- // not guard instructions. We might not be sure about the mode so we
- // we cannot fix the internal spmd-zation state either.
- int SPMD = 0, Generic = 0;
- for (auto *Kernel : ReachingKernelEntries) {
- auto &CBAA = A.getAAFor<AAKernelInfo>(
- *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
- if (CBAA.SPMDCompatibilityTracker.isValidState() &&
- CBAA.SPMDCompatibilityTracker.isAssumed())
- ++SPMD;
- else
- ++Generic;
- if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
- UsedAssumedInformationFromReachingKernels = true;
- }
- if (SPMD != 0 && Generic != 0)
+ if (!SPMDCompatibilityTracker.empty()) {
+ if (!ParallelLevels.isValidState())
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ else if (!ReachingKernelEntries.isValidState())
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ else {
+ // Check if all reaching kernels agree on the mode, as we cannot
+ // otherwise guard instructions. We might not be sure about the mode,
+ // so we cannot fix the internal SPMD-zation state either.
+ int SPMD = 0, Generic = 0;
+ for (auto *Kernel : ReachingKernelEntries) {
+ auto &CBAA = A.getAAFor<AAKernelInfo>(
+ *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
+ if (CBAA.SPMDCompatibilityTracker.isValidState() &&
+ CBAA.SPMDCompatibilityTracker.isAssumed())
+ ++SPMD;
+ else
+ ++Generic;
+ if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
+ UsedAssumedInformationFromReachingKernels = true;
+ }
+ if (SPMD != 0 && Generic != 0)
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ }
}
}
@@ -4077,15 +4440,6 @@ struct AAKernelInfoFunction : AAKernelInfo {
ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
}
- // If we are sure there are no parallel regions in the kernel we do not
- // want SPMD mode.
- if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
- ReachedKnownParallelRegions.isAtFixpoint() &&
- ReachedUnknownParallelRegions.isValidState() &&
- ReachedKnownParallelRegions.isValidState() &&
- !mayContainParallelRegion())
- SPMDCompatibilityTracker.indicatePessimisticFixpoint();
-
// If we haven't used any assumed information for the SPMD state we can fix
// it.
if (!UsedAssumedInformationInCheckRWInst &&
@@ -4288,6 +4642,12 @@ struct AAKernelInfoCallSite : AAKernelInfo {
if (auto *ParallelRegion = dyn_cast<Function>(
CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
ReachedKnownParallelRegions.insert(ParallelRegion);
+ // Check for nested parallelism: the parallel region itself may reach
+ // further (known or unknown) parallel regions.
+ auto &FnAA = A.getAAFor<AAKernelInfo>(
+ *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+ NestedParallelism |= !FnAA.getState().isValidState() ||
+ !FnAA.ReachedKnownParallelRegions.empty() ||
+ !FnAA.ReachedUnknownParallelRegions.empty();
break;
}
// The condition above should usually get the parallel region function
@@ -4419,10 +4779,10 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
if (!SimplifiedValue)
return Str + std::string("none");
- if (!SimplifiedValue.value())
+ if (!*SimplifiedValue)
return Str + std::string("nullptr");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.value()))
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
return Str + std::to_string(CI->getSExtValue());
return Str + std::string("unknown");
@@ -4445,9 +4805,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
A.registerSimplificationCallback(
IRPosition::callsite_returned(CB),
[&](const IRPosition &IRP, const AbstractAttribute *AA,
- bool &UsedAssumedInformation) -> Optional<Value *> {
+ bool &UsedAssumedInformation) -> std::optional<Value *> {
assert((isValidState() ||
- (SimplifiedValue && SimplifiedValue.value() == nullptr)) &&
+ (SimplifiedValue && *SimplifiedValue == nullptr)) &&
"Unexpected invalid state!");
if (!isAtFixpoint()) {
@@ -4465,9 +4825,6 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
case OMPRTL___kmpc_is_spmd_exec_mode:
Changed |= foldIsSPMDExecMode(A);
break;
- case OMPRTL___kmpc_is_generic_main_thread_id:
- Changed |= foldIsGenericMainThread(A);
- break;
case OMPRTL___kmpc_parallel_level:
Changed |= foldParallelLevel(A);
break;
@@ -4522,7 +4879,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
private:
/// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
ChangeStatus foldIsSPMDExecMode(Attributor &A) {
- Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+ std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
@@ -4582,31 +4939,9 @@ private:
: ChangeStatus::CHANGED;
}
- /// Fold __kmpc_is_generic_main_thread_id into a constant if possible.
- ChangeStatus foldIsGenericMainThread(Attributor &A) {
- Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
-
- CallBase &CB = cast<CallBase>(getAssociatedValue());
- Function *F = CB.getFunction();
- const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
- *this, IRPosition::function(*F), DepClassTy::REQUIRED);
-
- if (!ExecutionDomainAA.isValidState())
- return indicatePessimisticFixpoint();
-
- auto &Ctx = getAnchorValue().getContext();
- if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB))
- SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
- else
- return indicatePessimisticFixpoint();
-
- return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
/// Fold __kmpc_parallel_level into a constant if possible.
ChangeStatus foldParallelLevel(Attributor &A) {
- Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+ std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
@@ -4668,7 +5003,7 @@ private:
ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
// Specialize only if all the calls agree on the attribute's constant value.
int32_t CurrentAttrValue = -1;
- Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+ std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
@@ -4678,10 +5013,7 @@ private:
// Iterate over the kernels that reach this function
for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
- int32_t NextAttrVal = -1;
- if (K->hasFnAttribute(Attr))
- NextAttrVal =
- std::stoi(K->getFnAttribute(Attr).getValueAsString().str());
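+ // getFnAttributeAsParsedInteger folds the hasFnAttribute + std::stoi
+ // dance into one call; the default of -1 is returned when the attribute
+ // is absent.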
+ int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
if (NextAttrVal == -1 ||
(CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
@@ -4701,7 +5033,7 @@ private:
/// An optional value the associated value is assumed to fold to. That is, we
/// assume the associated value (which is a call) can be replaced by this
/// simplified value.
- Optional<Value *> SimplifiedValue;
+ std::optional<Value *> SimplifiedValue;
/// The runtime function kind of the callee of the associated call site.
RuntimeFunction RFKind;
@@ -4744,7 +5076,6 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
InitRFI.foreachUse(SCC, CreateKernelInfoCB);
- registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
@@ -4752,32 +5083,27 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
}
// Create CallSite AA for all Getters.
- for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
- auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
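+ // ICV tracking for getter calls is now opt-in via the hidden
+ // -openmp-deduce-icv-values flag.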
+ if (DeduceICVValues) {
+ for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
+ auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
- auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
+ auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
- auto CreateAA = [&](Use &U, Function &Caller) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
- if (!CI)
- return false;
+ auto CreateAA = [&](Use &U, Function &Caller) {
+ CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
+ if (!CI)
+ return false;
- auto &CB = cast<CallBase>(*CI);
+ auto &CB = cast<CallBase>(*CI);
- IRPosition CBPos = IRPosition::callsite_function(CB);
- A.getOrCreateAAFor<AAICVTracker>(CBPos);
- return false;
- };
+ IRPosition CBPos = IRPosition::callsite_function(CB);
+ A.getOrCreateAAFor<AAICVTracker>(CBPos);
+ return false;
+ };
- GetterRFI.foreachUse(SCC, CreateAA);
+ GetterRFI.foreachUse(SCC, CreateAA);
+ }
}
- auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
- auto CreateAA = [&](Use &U, Function &F) {
- A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
- return false;
- };
- if (!DisableOpenMPOptDeglobalization)
- GlobalizationRFI.foreachUse(SCC, CreateAA);
// Create an ExecutionDomain AA for every function and a HeapToStack AA for
// every function if there is a device kernel.
@@ -4788,17 +5114,44 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
if (F->isDeclaration())
continue;
- A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
- if (!DisableOpenMPOptDeglobalization)
- A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
+ // We look at internal functions only on demand, but if any use is not a
+ // direct call, or lies outside the current set of analyzed functions, we
+ // have to do it eagerly.
+ if (F->hasLocalLinkage()) {
+ if (llvm::all_of(F->uses(), [this](const Use &U) {
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ return CB && CB->isCallee(&U) &&
+ A.isRunOn(const_cast<Function *>(CB->getCaller()));
+ }))
+ continue;
+ }
+ registerAAsForFunction(A, *F);
+ }
+}
- for (auto &I : instructions(*F)) {
- if (auto *LI = dyn_cast<LoadInst>(&I)) {
- bool UsedAssumedInformation = false;
- A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
- UsedAssumedInformation, AA::Interprocedural);
- } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
- A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
+void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
+ if (!DisableOpenMPOptDeglobalization)
+ A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+ if (!DisableOpenMPOptDeglobalization)
+ A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
+
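+ // Seed AAs for the instruction kinds we care about: loads get an eager
+ // interprocedural simplification query, stores an AAIsDead, and assume
+ // intrinsic conditions an AAPotentialValues.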
+ for (auto &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ bool UsedAssumedInformation = false;
+ A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
+ UsedAssumedInformation, AA::Interprocedural);
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
+ continue;
+ }
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::assume) {
+ A.getOrCreateAAFor<AAPotentialValues>(
+ IRPosition::value(*II->getArgOperand(0)));
+ continue;
}
}
}
@@ -4970,10 +5323,13 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
}
// Look at every function in the Module unless it was internalized.
+ SetVector<Function *> Functions;
SmallVector<Function *, 16> SCC;
for (Function &F : M)
- if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
+ if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
SCC.push_back(&F);
+ Functions.insert(&F);
+ }
if (SCC.empty())
return PreservedAnalyses::all();
@@ -4987,18 +5343,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
BumpPtrAllocator Allocator;
CallGraphUpdater CGUpdater;
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
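+ // A null CGSCC makes the cache module-wide, so runtime-function uses are
+ // collected across the whole module rather than a function slice.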
+ OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, Kernels);
unsigned MaxFixpointIterations =
(isOpenMPDevice(M)) ? SetFixpointIterations : 32;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
+ AC.IsModulePass = true;
AC.RewriteSignatures = false;
AC.MaxFixpointIterations = MaxFixpointIterations;
AC.OREGetter = OREGetter;
AC.PassName = DEBUG_TYPE;
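+ // Let the Attributor register our AAs for functions it only discovers
+ // later, e.g. internal functions that become live.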
+ AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
Attributor A(Functions, InfoCache, AC);
@@ -5062,7 +5419,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
SetVector<Function *> Functions(SCC.begin(), SCC.end());
OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ Functions, Kernels);
+ /*CGSCC*/ &Functions, Kernels);
unsigned MaxFixpointIterations =
(isOpenMPDevice(M)) ? SetFixpointIterations : 32;
@@ -5074,6 +5431,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
AC.MaxFixpointIterations = MaxFixpointIterations;
AC.OREGetter = OREGetter;
AC.PassName = DEBUG_TYPE;
+ AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
Attributor A(Functions, InfoCache, AC);
@@ -5089,90 +5447,9 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
return PreservedAnalyses::all();
}
-namespace {
-
-struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
- CallGraphUpdater CGUpdater;
- static char ID;
-
- OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
- initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
- bool runOnSCC(CallGraphSCC &CGSCC) override {
- if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
- return false;
- if (DisableOpenMPOptimizations || skipSCC(CGSCC))
- return false;
-
- SmallVector<Function *, 16> SCC;
- // If there are kernels in the module, we have to run on all SCC's.
- for (CallGraphNode *CGN : CGSCC) {
- Function *Fn = CGN->getFunction();
- if (!Fn || Fn->isDeclaration())
- continue;
- SCC.push_back(Fn);
- }
-
- if (SCC.empty())
- return false;
-
- Module &M = CGSCC.getCallGraph().getModule();
- KernelSet Kernels = getDeviceKernels(M);
-
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- CGUpdater.initialize(CG, CGSCC);
-
- // Maintain a map of functions to avoid rebuilding the ORE
- DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
- auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
- std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
- if (!ORE)
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- AnalysisGetter AG;
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- BumpPtrAllocator Allocator;
- OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
- Allocator,
- /*CGSCC*/ Functions, Kernels);
-
- unsigned MaxFixpointIterations =
- (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
-
- AttributorConfig AC(CGUpdater);
- AC.DefaultInitializeLiveInternals = false;
- AC.IsModulePass = false;
- AC.RewriteSignatures = false;
- AC.MaxFixpointIterations = MaxFixpointIterations;
- AC.OREGetter = OREGetter;
- AC.PassName = DEBUG_TYPE;
-
- Attributor A(Functions, InfoCache, AC);
-
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Result = OMPOpt.run(false);
-
- if (PrintModuleAfterOptimizations)
- LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
-
- return Result;
- }
-
- bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
-};
-
-} // end anonymous namespace
-
KernelSet llvm::omp::getDeviceKernels(Module &M) {
// TODO: Create a more cross-platform way of determining device kernels.
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
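+ // getNamedMetadata, unlike getOrInsertNamedMetadata, does not create the
+ // node, so modules without "nvvm.annotations" stay unmodified.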
+ NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
KernelSet Kernels;
if (!MD)
@@ -5213,15 +5490,3 @@ bool llvm::omp::isOpenMPDevice(Module &M) {
return true;
}
-
-char OpenMPOptCGSCCLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
- "OpenMP specific optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
- "OpenMP specific optimizations", false, false)
-
-Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
- return new OpenMPOptCGSCCLegacyPass();
-}