diff options
Diffstat (limited to 'llvm/lib/Transforms/IPO/OpenMPOpt.cpp')
-rw-r--r-- | llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 255 |
1 files changed, 162 insertions, 93 deletions
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 2d765fb6ce6d..227ad8501f25 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -49,7 +49,6 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" -#include "llvm/Transforms/Utils/CodeExtractor.h" #include <algorithm> @@ -59,17 +58,16 @@ using namespace omp; #define DEBUG_TYPE "openmp-opt" static cl::opt<bool> DisableOpenMPOptimizations( - "openmp-opt-disable", cl::ZeroOrMore, - cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, - cl::init(false)); + "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), + cl::Hidden, cl::init(false)); static cl::opt<bool> EnableParallelRegionMerging( - "openmp-opt-enable-merging", cl::ZeroOrMore, + "openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false)); static cl::opt<bool> - DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore, + DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false)); @@ -85,42 +83,47 @@ static cl::opt<bool> HideMemoryTransferLatency( cl::Hidden, cl::init(false)); static cl::opt<bool> DisableOpenMPOptDeglobalization( - "openmp-opt-disable-deglobalization", cl::ZeroOrMore, + "openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false)); static cl::opt<bool> DisableOpenMPOptSPMDization( - "openmp-opt-disable-spmdization", cl::ZeroOrMore, + "openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false)); static cl::opt<bool> DisableOpenMPOptFolding( - "openmp-opt-disable-folding", cl::ZeroOrMore, + "openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false)); static cl::opt<bool> DisableOpenMPOptStateMachineRewrite( - "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore, + "openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false)); static cl::opt<bool> DisableOpenMPOptBarrierElimination( - "openmp-opt-disable-barrier-elimination", cl::ZeroOrMore, + "openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false)); static cl::opt<bool> PrintModuleAfterOptimizations( - "openmp-opt-print-module", cl::ZeroOrMore, + "openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false)); +static cl::opt<bool> PrintModuleBeforeOptimizations( + "openmp-opt-print-module-before", + cl::desc("Print the current module before OpenMP optimizations."), + cl::Hidden, cl::init(false)); + static cl::opt<bool> AlwaysInlineDeviceFunctions( - "openmp-opt-inline-device", cl::ZeroOrMore, + "openmp-opt-inline-device", cl::desc("Inline all applicible functions on the device."), cl::Hidden, cl::init(false)); static cl::opt<bool> - EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore, + EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false)); @@ -129,6 +132,11 @@ static cl::opt<unsigned> cl::desc("Maximal number of attributor iterations."), cl::init(256)); +static cl::opt<unsigned> + SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, + cl::desc("Maximum amount of shared memory to use."), + cl::init(std::numeric_limits<unsigned>::max())); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -493,11 +501,14 @@ struct OMPInformationCache : public InformationCache { // Remove the `noinline` attribute from `__kmpc`, `_OMP::` and `omp_` // functions, except if `optnone` is present. - for (Function &F : M) { - for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"}) - if (F.getName().startswith(Prefix) && - !F.hasFnAttribute(Attribute::OptimizeNone)) - F.removeFnAttr(Attribute::NoInline); + if (isOpenMPDevice(M)) { + for (Function &F : M) { + for (StringRef Prefix : {"__kmpc", "_ZN4_OMP", "omp_"}) + if (F.hasFnAttribute(Attribute::NoInline) && + F.getName().startswith(Prefix) && + !F.hasFnAttribute(Attribute::OptimizeNone)) + F.removeFnAttr(Attribute::NoInline); + } } // TODO: We should attach the attributes defined in OMPKinds.def. @@ -591,7 +602,7 @@ struct KernelInfoState : AbstractState { /// Abstract State interface ///{ - KernelInfoState() {} + KernelInfoState() = default; KernelInfoState(bool BestState) { if (!BestState) indicatePessimisticFixpoint(); @@ -926,8 +937,7 @@ private: SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; BasicBlock *StartBB = nullptr, *EndBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -966,8 +976,7 @@ private: const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); ParentBB->getTerminator()->eraseFromParent(); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, - BasicBlock &ContinuationIP) { + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1107,10 +1116,8 @@ private: // callbacks. SmallVector<Value *, 8> Args; for (auto *CI : MergableCIs) { - Value *Callee = - CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); - FunctionType *FT = - cast<FunctionType>(Callee->getType()->getPointerElementType()); + Value *Callee = CI->getArgOperand(CallbackCalleeOperand); + FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask; Args.clear(); Args.push_back(OutlinedFn->getArg(0)); Args.push_back(OutlinedFn->getArg(1)); @@ -1458,7 +1465,6 @@ private: case Intrinsic::nvvm_barrier0_and: case Intrinsic::nvvm_barrier0_or: case Intrinsic::nvvm_barrier0_popc: - case Intrinsic::amdgcn_s_barrier: return true; default: break; @@ -2120,6 +2126,8 @@ private: OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII NumThreads( + OMPInfoCache, OMPRTL___kmpc_get_hardware_num_threads_in_block); ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); @@ -2407,8 +2415,7 @@ struct AAICVTrackerFunction : public AAICVTracker { auto CallCheck = [&](Instruction &I) { Optional<Value *> ReplVal = getValueForCall(A, I, ICV); - if (ReplVal.hasValue() && - ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) + if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) HasChanged = ChangeStatus::CHANGED; return true; @@ -2468,7 +2475,8 @@ struct AAICVTrackerFunction : public AAICVTracker { if (ICVTrackingAA.isAssumedTracked()) { Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV); - if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache))) + if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I), + OMPInfoCache))) return URV; } @@ -2509,13 +2517,13 @@ struct AAICVTrackerFunction : public AAICVTracker { if (ValuesMap.count(CurrInst)) { Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst); // Unknown value, track new. - if (!ReplVal.hasValue()) { + if (!ReplVal) { ReplVal = NewReplVal; break; } // If we found a new value, we can't know the icv value anymore. - if (NewReplVal.hasValue()) + if (NewReplVal) if (ReplVal != NewReplVal) return nullptr; @@ -2523,11 +2531,11 @@ struct AAICVTrackerFunction : public AAICVTracker { } Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV); - if (!NewReplVal.hasValue()) + if (!NewReplVal) continue; // Unknown value, track new. - if (!ReplVal.hasValue()) { + if (!ReplVal) { ReplVal = NewReplVal; break; } @@ -2539,7 +2547,7 @@ struct AAICVTrackerFunction : public AAICVTracker { } // If we are in the same BB and we have a value, we are done. - if (CurrBB == I->getParent() && ReplVal.hasValue()) + if (CurrBB == I->getParent() && ReplVal) return ReplVal; // Go through all predecessors and add terminators for analysis. @@ -2597,7 +2605,7 @@ struct AAICVTrackerFunctionReturned : AAICVTracker { ICVTrackingAA.getReplacementValue(ICV, &I, A); // If we found a second ICV value there is no unique returned value. - if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal) + if (UniqueICVValue && UniqueICVValue != NewReplVal) return false; UniqueICVValue = NewReplVal; @@ -2648,10 +2656,10 @@ struct AAICVTrackerCallSite : AAICVTracker { } ChangeStatus manifest(Attributor &A) override { - if (!ReplVal.hasValue() || !ReplVal.getValue()) + if (!ReplVal || !*ReplVal) return ChangeStatus::UNCHANGED; - A.changeValueAfterManifest(*getCtxI(), **ReplVal); + A.changeAfterManifest(IRPosition::inst(*getCtxI()), **ReplVal); A.deleteAfterManifest(*getCtxI()); return ChangeStatus::CHANGED; @@ -2789,7 +2797,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs; /// Total number of basic blocks in this function. - long unsigned NumBBs; + long unsigned NumBBs = 0; }; ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { @@ -2952,12 +2960,23 @@ struct AAHeapToSharedFunction : public AAHeapToShared { } void initialize(Attributor &A) override { + if (DisableOpenMPOptDeglobalization) { + indicatePessimisticFixpoint(); + return; + } + auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + Attributor::SimplifictionCallbackTy SCB = + [](const IRPosition &, const AbstractAttribute *, + bool &) -> Optional<Value *> { return nullptr; }; for (User *U : RFI.Declaration->users()) - if (CallBase *CB = dyn_cast<CallBase>(U)) + if (CallBase *CB = dyn_cast<CallBase>(U)) { MallocCalls.insert(CB); + A.registerSimplificationCallback(IRPosition::callsite_returned(*CB), + SCB); + } findPotentialRemovedFreeCalls(A); } @@ -2999,6 +3018,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0)); + if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) { + LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB + << " with shared memory." + << " Shared memory usage is limited to " + << SharedMemoryLimit << " bytes\n"); + continue; + } + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); @@ -3029,11 +3056,12 @@ struct AAHeapToSharedFunction : public AAHeapToShared { "HeapToShared on allocation without alignment attribute"); SharedMem->setAlignment(MaybeAlign(Alignment)); - A.changeValueAfterManifest(*CB, *NewBuffer); + A.changeAfterManifest(IRPosition::callsite_returned(*CB), *NewBuffer); A.deleteAfterManifest(*CB); A.deleteAfterManifest(*FreeCalls.front()); - NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); + SharedMemoryUsed += AllocSize->getZExtValue(); + NumBytesMovedToSharedMemory = SharedMemoryUsed; Changed = ChangeStatus::CHANGED; } @@ -3069,6 +3097,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { SmallSetVector<CallBase *, 4> MallocCalls; /// Collection of potentially removed free calls in a function. SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls; + /// The total amount of shared memory that has been used for HeapToShared. + unsigned SharedMemoryUsed = 0; }; struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { @@ -3137,12 +3167,6 @@ struct AAKernelInfoFunction : AAKernelInfo { auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); Function *Fn = getAnchorScope(); - if (!OMPInfoCache.Kernels.count(Fn)) - return; - - // Add itself to the reaching kernel and set IsKernelEntry. - ReachingKernelEntries.insert(Fn); - IsKernelEntry = true; OMPInformationCache::RuntimeFunctionInfo &InitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; @@ -3176,10 +3200,12 @@ struct AAKernelInfoFunction : AAKernelInfo { Fn); // Ignore kernels without initializers such as global constructors. - if (!KernelInitCB || !KernelDeinitCB) { - indicateOptimisticFixpoint(); + if (!KernelInitCB || !KernelDeinitCB) return; - } + + // Add itself to the reaching kernel and set IsKernelEntry. + ReachingKernelEntries.insert(Fn); + IsKernelEntry = true; // For kernels we might need to initialize/finalize the IsSPMD state and // we need to register a simplification callback so that the Attributor @@ -3345,8 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo { return false; } - // Check if the kernel is already in SPMD mode, if so, return success. + // Get the actual kernel, could be the caller of the anchor scope if we have + // a debug wrapper. Function *Kernel = getAnchorScope(); + if (Kernel->hasLocalLinkage()) { + assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper."); + auto *CB = cast<CallBase>(Kernel->user_back()); + Kernel = CB->getCaller(); + } + assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!"); + + // Check if the kernel is already in SPMD mode, if so, return success. GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( (Kernel->getName() + "_exec_mode").str()); assert(ExecMode && "Kernel without exec mode?"); @@ -3711,9 +3746,9 @@ struct AAKernelInfoFunction : AAKernelInfo { // __kmpc_get_hardware_num_threads_in_block(); // WarpSize = __kmpc_get_warp_size(); // BlockSize = BlockHwSize - WarpSize; - // if (InitCB >= BlockSize) return; - // IsWorkerCheckBB: bool IsWorker = InitCB >= 0; + // IsWorkerCheckBB: bool IsWorker = InitCB != -1; // if (IsWorker) { + // if (InitCB >= BlockSize) return; // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; // bool Active = __kmpc_kernel_parallel(&WorkFn); @@ -3770,6 +3805,13 @@ struct AAKernelInfoFunction : AAKernelInfo { ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); InitBB->getTerminator()->eraseFromParent(); + Instruction *IsWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), -1), + "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); + Module &M = *Kernel->getParent(); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); FunctionCallee BlockHwSizeFn = @@ -3779,29 +3821,22 @@ struct AAKernelInfoFunction : AAKernelInfo { OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_warp_size); CallInst *BlockHwSize = - CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize); BlockHwSize->setDebugLoc(DLoc); - CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + CallInst *WarpSize = + CallInst::Create(WarpSizeFn, "warp.size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize); WarpSize->setDebugLoc(DLoc); - Instruction *BlockSize = - BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + Instruction *BlockSize = BinaryOperator::CreateSub( + BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB); BlockSize->setDebugLoc(DLoc); - Instruction *IsMainOrWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, - BlockSize, "thread.is_main_or_worker", InitBB); + Instruction *IsMainOrWorker = ICmpInst::Create( + ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize, + "thread.is_main_or_worker", IsWorkerCheckBB); IsMainOrWorker->setDebugLoc(DLoc); - BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker, - InitBB); - - Instruction *IsWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", IsWorkerCheckBB); - IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, - IsWorkerCheckBB); + BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB, + IsMainOrWorker, IsWorkerCheckBB); // Create local storage for the work function pointer. const DataLayout &DL = M.getDataLayout(); @@ -4241,10 +4276,10 @@ struct AAKernelInfoCallSite : AAKernelInfo { unsigned ScheduleTypeVal = ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; switch (OMPScheduleType(ScheduleTypeVal)) { - case OMPScheduleType::Static: - case OMPScheduleType::StaticChunked: - case OMPScheduleType::Distribute: - case OMPScheduleType::DistributeChunked: + case OMPScheduleType::UnorderedStatic: + case OMPScheduleType::UnorderedStaticChunked: + case OMPScheduleType::OrderedDistribute: + case OMPScheduleType::OrderedDistributeChunked: break; default: SPMDCompatibilityTracker.indicatePessimisticFixpoint(); @@ -4390,7 +4425,7 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { std::string Str("simplified value: "); - if (!SimplifiedValue.hasValue()) + if (!SimplifiedValue) return Str + std::string("none"); if (!SimplifiedValue.getValue()) @@ -4420,8 +4455,8 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { IRPosition::callsite_returned(CB), [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional<Value *> { - assert((isValidState() || (SimplifiedValue.hasValue() && - SimplifiedValue.getValue() == nullptr)) && + assert((isValidState() || + (SimplifiedValue && SimplifiedValue.getValue() == nullptr)) && "Unexpected invalid state!"); if (!isAtFixpoint()) { @@ -4461,9 +4496,9 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { ChangeStatus manifest(Attributor &A) override { ChangeStatus Changed = ChangeStatus::UNCHANGED; - if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) { + if (SimplifiedValue && *SimplifiedValue) { Instruction &I = *getCtxI(); - A.changeValueAfterManifest(I, **SimplifiedValue); + A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue); A.deleteAfterManifest(I); CallBase *CB = dyn_cast<CallBase>(&I); @@ -4549,7 +4584,7 @@ private: // We have empty reaching kernels, therefore we cannot tell if the // associated call site can be folded. At this moment, SimplifiedValue // must be none. - assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none"); + assert(!SimplifiedValue && "SimplifiedValue should be none"); } return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED @@ -4592,7 +4627,7 @@ private: return indicatePessimisticFixpoint(); if (CallerKernelInfoAA.ReachingKernelEntries.empty()) { - assert(!SimplifiedValue.hasValue() && + assert(!SimplifiedValue && "SimplifiedValue should keep none at this point"); return ChangeStatus::UNCHANGED; } @@ -4700,18 +4735,23 @@ void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { void OpenMPOpt::registerAAs(bool IsModulePass) { if (SCC.empty()) - return; + if (IsModulePass) { // Ensure we create the AAKernelInfo AAs first and without triggering an // update. This will make sure we register all value simplification // callbacks before any other AA has the chance to create an AAValueSimplify // or similar. - for (Function *Kernel : OMPInfoCache.Kernels) + auto CreateKernelInfoCB = [&](Use &, Function &Kernel) { A.getOrCreateAAFor<AAKernelInfo>( - IRPosition::function(*Kernel), /* QueryingAA */ nullptr, + IRPosition::function(Kernel), /* QueryingAA */ nullptr, DepClassTy::NONE, /* ForceUpdate */ false, /* UpdateAfterInit */ false); + return false; + }; + OMPInformationCache::RuntimeFunctionInfo &InitRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; + InitRFI.foreachUse(SCC, CreateKernelInfoCB); registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); @@ -4899,6 +4939,9 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); KernelSet Kernels = getDeviceKernels(M); + if (PrintModuleBeforeOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M); + auto IsCalled = [&](Function &F) { if (Kernels.contains(&F)) return true; @@ -4958,8 +5001,15 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(true); @@ -5001,6 +5051,9 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, Module &M = *C.begin()->getFunction().getParent(); + if (PrintModuleBeforeOptimizations) + LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M); + KernelSet Kernels = getDeviceKernels(M); FunctionAnalysisManager &FAM = @@ -5022,8 +5075,16 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.IsModulePass = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Changed = OMPOpt.run(false); @@ -5093,8 +5154,16 @@ struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? SetFixpointIterations : 32; - Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, - MaxFixpointIterations, OREGetter, DEBUG_TYPE); + + AttributorConfig AC(CGUpdater); + AC.DefaultInitializeLiveInternals = false; + AC.IsModulePass = false; + AC.RewriteSignatures = false; + AC.MaxFixpointIterations = MaxFixpointIterations; + AC.OREGetter = OREGetter; + AC.PassName = DEBUG_TYPE; + + Attributor A(Functions, InfoCache, AC); OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); bool Result = OMPOpt.run(false); |