author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000
commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) |
tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Coroutines/CoroFrame.cpp |
parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) |
Diffstat (limited to 'llvm/lib/Transforms/Coroutines/CoroFrame.cpp')
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 608
1 file changed, 418 insertions(+), 190 deletions(-)
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index e98c601648e0..1f373270f951 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -16,6 +16,7 @@
 #include "CoroInternal.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
@@ -37,6 +38,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
+#include <deque>
 #include <optional>
 
 using namespace llvm;
@@ -87,7 +89,7 @@ public:
 // crosses a suspend point.
 //
 namespace {
-struct SuspendCrossingInfo {
+class SuspendCrossingInfo {
   BlockToIndexMapping Mapping;
 
   struct BlockData {
@@ -96,20 +98,30 @@ struct SuspendCrossingInfo {
     bool Suspend = false;
     bool End = false;
    bool KillLoop = false;
+    bool Changed = false;
   };
   SmallVector<BlockData, SmallVectorThreshold> Block;
 
-  iterator_range<succ_iterator> successors(BlockData const &BD) const {
+  iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
     BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
-    return llvm::successors(BB);
+    return llvm::predecessors(BB);
   }
 
   BlockData &getBlockData(BasicBlock *BB) {
     return Block[Mapping.blockToIndex(BB)];
   }
 
+  /// Compute the BlockData for the current function in one iteration.
+  /// Returns whether the BlockData changes in this iteration.
+  /// Initialize - Whether this is the first iteration, we can optimize
+  /// the initial case a little bit by manual loop switch.
+  template <bool Initialize = false> bool computeBlockData();
+
+public:
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump() const;
   void dump(StringRef Label, BitVector const &BV) const;
+#endif
 
   SuspendCrossingInfo(Function &F, coro::Shape &Shape);
@@ -211,6 +223,72 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
 }
 #endif
 
+template <bool Initialize> bool SuspendCrossingInfo::computeBlockData() {
+  const size_t N = Mapping.size();
+  bool Changed = false;
+
+  for (size_t I = 0; I < N; ++I) {
+    auto &B = Block[I];
+
+    // We don't need to count the predecessors when initialization.
+    if constexpr (!Initialize)
+      // If all the predecessors of the current Block don't change,
+      // the BlockData for the current block must not change too.
+      if (all_of(predecessors(B), [this](BasicBlock *BB) {
+            return !Block[Mapping.blockToIndex(BB)].Changed;
+          })) {
+        B.Changed = false;
+        continue;
+      }
+
+    // Saved Consumes and Kills bitsets so that it is easy to see
+    // if anything changed after propagation.
+    auto SavedConsumes = B.Consumes;
+    auto SavedKills = B.Kills;
+
+    for (BasicBlock *PI : predecessors(B)) {
+      auto PrevNo = Mapping.blockToIndex(PI);
+      auto &P = Block[PrevNo];
+
+      // Propagate Kills and Consumes from predecessors into B.
+      B.Consumes |= P.Consumes;
+      B.Kills |= P.Kills;
+
+      // If block P is a suspend block, it should propagate kills into block
+      // B for every block P consumes.
+      if (P.Suspend)
+        B.Kills |= P.Consumes;
+    }
+
+    if (B.Suspend) {
+      // If block S is a suspend block, it should kill all of the blocks it
+      // consumes.
+      B.Kills |= B.Consumes;
+    } else if (B.End) {
+      // If block B is an end block, it should not propagate kills as the
+      // blocks following coro.end() are reached during initial invocation
+      // of the coroutine while all the data are still available on the
+      // stack or in the registers.
+      B.Kills.reset();
+    } else {
+      // This is reached when B block it not Suspend nor coro.end and it
+      // need to make sure that it is not in the kill set.
+      B.KillLoop |= B.Kills[I];
+      B.Kills.reset(I);
+    }
+
+    if constexpr (!Initialize) {
+      B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
+      Changed |= B.Changed;
+    }
+  }
+
+  if constexpr (Initialize)
+    return true;
+
+  return Changed;
+}
+
 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
     : Mapping(F) {
   const size_t N = Mapping.size();
@@ -222,6 +300,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
     B.Consumes.resize(N);
     B.Kills.resize(N);
     B.Consumes.set(I);
+    B.Changed = true;
   }
 
   // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
@@ -246,73 +325,123 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
       markSuspendBlock(Save);
   }
 
-  // Iterate propagating consumes and kills until they stop changing.
-  int Iteration = 0;
-  (void)Iteration;
-
-  bool Changed;
-  do {
-    LLVM_DEBUG(dbgs() << "iteration " << ++Iteration);
-    LLVM_DEBUG(dbgs() << "==============\n");
-
-    Changed = false;
-    for (size_t I = 0; I < N; ++I) {
-      auto &B = Block[I];
-      for (BasicBlock *SI : successors(B)) {
-
-        auto SuccNo = Mapping.blockToIndex(SI);
-
-        // Saved Consumes and Kills bitsets so that it is easy to see
-        // if anything changed after propagation.
-        auto &S = Block[SuccNo];
-        auto SavedConsumes = S.Consumes;
-        auto SavedKills = S.Kills;
-
-        // Propagate Kills and Consumes from block B into its successor S.
-        S.Consumes |= B.Consumes;
-        S.Kills |= B.Kills;
-
-        // If block B is a suspend block, it should propagate kills into the
-        // its successor for every block B consumes.
-        if (B.Suspend) {
-          S.Kills |= B.Consumes;
-        }
-        if (S.Suspend) {
-          // If block S is a suspend block, it should kill all of the blocks it
-          // consumes.
-          S.Kills |= S.Consumes;
-        } else if (S.End) {
-          // If block S is an end block, it should not propagate kills as the
-          // blocks following coro.end() are reached during initial invocation
-          // of the coroutine while all the data are still available on the
-          // stack or in the registers.
-          S.Kills.reset();
-        } else {
-          // This is reached when S block it not Suspend nor coro.end and it
-          // need to make sure that it is not in the kill set.
-          S.KillLoop |= S.Kills[SuccNo];
-          S.Kills.reset(SuccNo);
-        }
-
-        // See if anything changed.
-        Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
-
-        if (S.Kills != SavedKills) {
-          LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
-                            << "\n");
-          LLVM_DEBUG(dump("S.Kills", S.Kills));
-          LLVM_DEBUG(dump("SavedKills", SavedKills));
-        }
-        if (S.Consumes != SavedConsumes) {
-          LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
-          LLVM_DEBUG(dump("S.Consume", S.Consumes));
-          LLVM_DEBUG(dump("SavedCons", SavedConsumes));
-        }
-      }
-    }
-  } while (Changed);
-  LLVM_DEBUG(dump());
-}
+  computeBlockData</*Initialize=*/true>();
+
+  while (computeBlockData())
+    ;
+
+  LLVM_DEBUG(dump());
+}
+
+namespace {
+
+// RematGraph is used to construct a DAG for rematerializable instructions
+// When the constructor is invoked with a candidate instruction (which is
+// materializable) it builds a DAG of materializable instructions from that
+// point.
+// Typically, for each instruction identified as re-materializable across a
+// suspend point, a RematGraph will be created.
+struct RematGraph {
+  // Each RematNode in the graph contains the edges to instructions providing
+  // operands in the current node.
+  struct RematNode {
+    Instruction *Node;
+    SmallVector<RematNode *> Operands;
+    RematNode() = default;
+    RematNode(Instruction *V) : Node(V) {}
+  };
+
+  RematNode *EntryNode;
+  using RematNodeMap =
+      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
+  RematNodeMap Remats;
+  const std::function<bool(Instruction &)> &MaterializableCallback;
+  SuspendCrossingInfo &Checker;
+
+  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
+             Instruction *I, SuspendCrossingInfo &Checker)
+      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
+    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
+    EntryNode = FirstNode.get();
+    std::deque<std::unique_ptr<RematNode>> WorkList;
+    addNode(std::move(FirstNode), WorkList, cast<User>(I));
+    while (WorkList.size()) {
+      std::unique_ptr<RematNode> N = std::move(WorkList.front());
+      WorkList.pop_front();
+      addNode(std::move(N), WorkList, cast<User>(I));
+    }
+  }
+
+  void addNode(std::unique_ptr<RematNode> NUPtr,
+               std::deque<std::unique_ptr<RematNode>> &WorkList,
+               User *FirstUse) {
+    RematNode *N = NUPtr.get();
+    if (Remats.count(N->Node))
+      return;
+
+    // We haven't see this node yet - add to the list
+    Remats[N->Node] = std::move(NUPtr);
+    for (auto &Def : N->Node->operands()) {
+      Instruction *D = dyn_cast<Instruction>(Def.get());
+      if (!D || !MaterializableCallback(*D) ||
+          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
+        continue;
+
+      if (Remats.count(D)) {
+        // Already have this in the graph
+        N->Operands.push_back(Remats[D].get());
+        continue;
+      }
+
+      bool NoMatch = true;
+      for (auto &I : WorkList) {
+        if (I->Node == D) {
+          NoMatch = false;
+          N->Operands.push_back(I.get());
+          break;
+        }
+      }
+      if (NoMatch) {
+        // Create a new node
+        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
+        N->Operands.push_back(ChildNode.get());
+        WorkList.push_back(std::move(ChildNode));
+      }
+    }
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() const {
+    dbgs() << "Entry (";
+    if (EntryNode->Node->getParent()->hasName())
+      dbgs() << EntryNode->Node->getParent()->getName();
+    else
+      EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
+    dbgs() << ") : " << *EntryNode->Node << "\n";
+    for (auto &E : Remats) {
+      dbgs() << *(E.first) << "\n";
+      for (RematNode *U : E.second->Operands)
+        dbgs() << "  " << *U->Node << "\n";
+    }
+  }
+#endif
+};
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct GraphTraits<RematGraph *> {
+  using NodeRef = RematGraph::RematNode *;
+  using ChildIteratorType = RematGraph::RematNode **;
+
+  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return N->Operands.begin();
+  }
+  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
+};
+
+} // end namespace llvm
 
 #undef DEBUG_TYPE // "coro-suspend-crossing"
 #define DEBUG_TYPE "coro-frame"
@@ -425,6 +554,15 @@ static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
     I->dump();
   }
 }
+static void dumpRemats(
+    StringRef Title,
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
+  dbgs() << "------------- " << Title << "--------------\n";
+  for (const auto &E : RM) {
+    E.second->dump();
+    dbgs() << "--\n";
+  }
+}
 
 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
   dbgs() << "------------- Allocas --------------\n";
@@ -637,10 +775,10 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F,
     return;
   }
 
-  // Because there are pathes from the lifetime.start to coro.end
+  // Because there are paths from the lifetime.start to coro.end
   // for each alloca, the liferanges for every alloca is overlaped
   // in the blocks who contain coro.end and the successor blocks.
-  // So we choose to skip there blocks when we calculates the liferange
+  // So we choose to skip there blocks when we calculate the liferange
   // for each alloca. It should be reasonable since there shouldn't be uses
   // in these blocks and the coroutine frame shouldn't be used outside the
   // coroutine body.
@@ -820,7 +958,7 @@ void FrameTypeBuilder::finish(StructType *Ty) {
 static void cacheDIVar(FrameDataInfo &FrameData,
                        DenseMap<Value *, DILocalVariable *> &DIVarCache) {
   for (auto *V : FrameData.getAllDefs()) {
-    if (DIVarCache.find(V) != DIVarCache.end())
+    if (DIVarCache.contains(V))
       continue;
 
     auto DDIs = FindDbgDeclareUses(V);
@@ -852,18 +990,8 @@ static StringRef solveTypeName(Type *Ty) {
     return "__floating_type_";
   }
 
-  if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
-    if (PtrTy->isOpaque())
-      return "PointerType";
-    Type *PointeeTy = PtrTy->getNonOpaquePointerElementType();
-    auto Name = solveTypeName(PointeeTy);
-    if (Name == "UnknownType")
-      return "PointerType";
-    SmallString<16> Buffer;
-    Twine(Name + "_Ptr").toStringRef(Buffer);
-    auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
-    return MDName->getString();
-  }
+  if (Ty->isPointerTy())
+    return "PointerType";
 
   if (Ty->isStructTy()) {
     if (!cast<StructType>(Ty)->hasName())
@@ -1043,7 +1171,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
                                       dwarf::DW_ATE_unsigned_char)});
 
   for (auto *V : FrameData.getAllDefs()) {
-    if (DIVarCache.find(V) == DIVarCache.end())
+    if (!DIVarCache.contains(V))
      continue;
 
     auto Index = FrameData.getFieldIndex(V);
@@ -1075,7 +1203,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   // fields confilicts with each other.
   unsigned UnknownTypeNum = 0;
   for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
-    if (OffsetCache.find(Index) == OffsetCache.end())
+    if (!OffsetCache.contains(Index))
       continue;
 
     std::string Name;
@@ -1090,7 +1218,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
     AlignInBits = OffsetCache[Index].first * 8;
     OffsetInBits = OffsetCache[Index].second * 8;
 
-    if (NameCache.find(Index) != NameCache.end()) {
+    if (NameCache.contains(Index)) {
       Name = NameCache[Index].str();
       DITy = TyCache[Index];
     } else {
@@ -1282,7 +1410,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
 // function call or any of the memory intrinsics, we check whether this
 // instruction is prior to CoroBegin. To answer question 3, we track the offsets
 // of all aliases created for the alloca prior to CoroBegin but used after
-// CoroBegin. llvm::Optional is used to be able to represent the case when the
+// CoroBegin. std::optional is used to be able to represent the case when the
 // offset is unknown (e.g. when you have a PHINode that takes in different
 // offset values). We cannot handle unknown offsets and will assert. This is the
 // potential issue left out. An ideal solution would likely require a
@@ -1586,11 +1714,12 @@ static void createFramePtr(coro::Shape &Shape) {
 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
   auto *CB = Shape.CoroBegin;
   LLVMContext &C = CB->getContext();
+  Function *F = CB->getFunction();
   IRBuilder<> Builder(C);
   StructType *FrameTy = Shape.FrameTy;
   Value *FramePtr = Shape.FramePtr;
-  DominatorTree DT(*CB->getFunction());
-  SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> DbgPtrAllocaCache;
+  DominatorTree DT(*F);
+  SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
 
   // Create a GEP with the given index into the coroutine frame for the original
   // value Orig. Appends an extra 0 index for array-allocas, preserving the
@@ -1723,6 +1852,21 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
          SpillAlignment, E.first->getName() + Twine(".reload"));
 
       TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Def);
+      // Try best to find dbg.declare. If the spill is a temp, there may not
+      // be a direct dbg.declare. Walk up the load chain to find one from an
+      // alias.
+      if (F->getSubprogram()) {
+        auto *CurDef = Def;
+        while (DIs.empty() && isa<LoadInst>(CurDef)) {
+          auto *LdInst = cast<LoadInst>(CurDef);
+          // Only consider ptr to ptr same type load.
+          if (LdInst->getPointerOperandType() != LdInst->getType())
+            break;
+          CurDef = LdInst->getPointerOperand();
+          DIs = FindDbgDeclareUses(CurDef);
+        }
+      }
+
       for (DbgDeclareInst *DDI : DIs) {
        bool AllowUnresolved = false;
         // This dbg.declare is preserved for all coro-split function
@@ -1734,16 +1878,10 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
                       &*Builder.GetInsertPoint());
         // This dbg.declare is for the main function entry point. It
         // will be deleted in all coro-split functions.
-        coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame);
+        coro::salvageDebugInfo(ArgToAllocaMap, DDI, Shape.OptimizeFrame);
       }
     }
 
-    // Salvage debug info on any dbg.addr that we see. We do not insert them
-    // into each block where we have a use though.
-    if (auto *DI = dyn_cast<DbgAddrIntrinsic>(U)) {
-      coro::salvageDebugInfo(DbgPtrAllocaCache, DI, Shape.OptimizeFrame);
-    }
-
     // If we have a single edge PHINode, remove it and replace it with a
     // reload from the coroutine frame. (We already took care of multi edge
     // PHINodes by rewriting them in the rewritePHIs function).
@@ -1813,11 +1951,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
         DVI->replaceUsesOfWith(Alloca, G);
 
       for (Instruction *I : UsersToUpdate) {
-        // It is meaningless to remain the lifetime intrinsics refer for the
+        // It is meaningless to retain the lifetime intrinsics refer for the
         // member of coroutine frames and the meaningless lifetime intrinsics
         // are possible to block further optimizations.
-        if (I->isLifetimeStartOrEnd())
+        if (I->isLifetimeStartOrEnd()) {
+          I->eraseFromParent();
           continue;
+        }
 
         I->replaceUsesOfWith(Alloca, G);
       }
@@ -2089,11 +2229,12 @@ static void rewritePHIs(Function &F) {
     rewritePHIs(*BB);
 }
 
+/// Default materializable callback
 // Check for instructions that we can recreate on resume as opposed to spill
 // the result into a coroutine frame.
-static bool materializable(Instruction &V) {
-  return isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
-         isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V);
+bool coro::defaultMaterializable(Instruction &V) {
+  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
 }
 
 // Check for structural coroutine intrinsics that should not be spilled into
@@ -2103,41 +2244,82 @@ static bool isCoroutineStructureIntrinsic(Instruction &I) {
          isa<CoroSuspendInst>(&I);
 }
 
-// For every use of the value that is across suspend point, recreate that value
-// after a suspend point.
-static void rewriteMaterializableInstructions(IRBuilder<> &IRB,
-                                              const SpillInfo &Spills) {
-  for (const auto &E : Spills) {
-    Value *Def = E.first;
-    BasicBlock *CurrentBlock = nullptr;
+// For each instruction identified as materializable across the suspend point,
+// and its associated DAG of other rematerializable instructions,
+// recreate the DAG of instructions after the suspend point.
+static void rewriteMaterializableInstructions(
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
+        &AllRemats) {
+  // This has to be done in 2 phases
+  // Do the remats and record the required defs to be replaced in the
+  // original use instructions
+  // Once all the remats are complete, replace the uses in the final
+  // instructions with the new defs
+  typedef struct {
+    Instruction *Use;
+    Instruction *Def;
+    Instruction *Remat;
+  } ProcessNode;
+
+  SmallVector<ProcessNode> FinalInstructionsToProcess;
+
+  for (const auto &E : AllRemats) {
+    Instruction *Use = E.first;
     Instruction *CurrentMaterialization = nullptr;
-    for (Instruction *U : E.second) {
-      // If we have not seen this block, materialize the value.
-      if (CurrentBlock != U->getParent()) {
-
-        bool IsInCoroSuspendBlock = isa<AnyCoroSuspendInst>(U);
-        CurrentBlock = U->getParent();
-        auto *InsertBlock = IsInCoroSuspendBlock
-                                ? CurrentBlock->getSinglePredecessor()
-                                : CurrentBlock;
-        CurrentMaterialization = cast<Instruction>(Def)->clone();
-        CurrentMaterialization->setName(Def->getName());
-        CurrentMaterialization->insertBefore(
-            IsInCoroSuspendBlock ? InsertBlock->getTerminator()
-                                 : &*InsertBlock->getFirstInsertionPt());
-      }
-      if (auto *PN = dyn_cast<PHINode>(U)) {
-        assert(PN->getNumIncomingValues() == 1 &&
-               "unexpected number of incoming "
-               "values in the PHINode");
-        PN->replaceAllUsesWith(CurrentMaterialization);
-        PN->eraseFromParent();
-        continue;
-      }
-      // Replace all uses of Def in the current instruction with the
-      // CurrentMaterialization for the block.
-      U->replaceUsesOfWith(Def, CurrentMaterialization);
+    RematGraph *RG = E.second.get();
+    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
+    SmallVector<Instruction *> InstructionsToProcess;
+
+    // If the target use is actually a suspend instruction then we have to
+    // insert the remats into the end of the predecessor (there should only be
+    // one). This is so that suspend blocks always have the suspend instruction
+    // as the first instruction.
+    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
+    if (isa<AnyCoroSuspendInst>(Use)) {
+      BasicBlock *SuspendPredecessorBlock =
+          Use->getParent()->getSinglePredecessor();
+      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
+      InsertPoint = SuspendPredecessorBlock->getTerminator();
+    }
+
+    // Note: skip the first instruction as this is the actual use that we're
+    // rematerializing everything for.
+    auto I = RPOT.begin();
+    ++I;
+    for (; I != RPOT.end(); ++I) {
+      Instruction *D = (*I)->Node;
+      CurrentMaterialization = D->clone();
+      CurrentMaterialization->setName(D->getName());
+      CurrentMaterialization->insertBefore(InsertPoint);
+      InsertPoint = CurrentMaterialization;
+
+      // Replace all uses of Def in the instructions being added as part of
+      // this rematerialization group
+      for (auto &I : InstructionsToProcess)
+        I->replaceUsesOfWith(D, CurrentMaterialization);
+
+      // Don't replace the final use at this point as this can cause problems
+      // for other materializations. Instead, for any final use that uses a
+      // define that's being rematerialized, record the replace values
+      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
+        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
+          FinalInstructionsToProcess.push_back(
+              {Use, D, CurrentMaterialization});
+
+      InstructionsToProcess.push_back(CurrentMaterialization);
+    }
+  }
+
+  // Finally, replace the uses with the defines that we've just rematerialized
+  for (auto &R : FinalInstructionsToProcess) {
+    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
+      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+                                                "values in the PHINode");
+      PN->replaceAllUsesWith(R.Remat);
+      PN->eraseFromParent();
+      continue;
     }
+    R.Use->replaceUsesOfWith(R.Def, R.Remat);
   }
 }
@@ -2407,10 +2589,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
   IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
 
   auto ArgTy = cast<PointerType>(Arg.getType());
-  // swifterror arguments are required to have pointer-to-pointer type,
-  // so create a pointer-typed alloca with opaque pointers.
-  auto ValueTy = ArgTy->isOpaque() ? PointerType::getUnqual(F.getContext())
-                                   : ArgTy->getNonOpaquePointerElementType();
+  auto ValueTy = PointerType::getUnqual(F.getContext());
 
   // Reduce to the alloca case:
@@ -2523,6 +2702,9 @@ static void sinkSpillUsesAfterCoroBegin(Function &F,
 /// hence minimizing the amount of data we end up putting on the frame.
 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
                                      SuspendCrossingInfo &Checker) {
+  if (F.hasOptNone())
+    return;
+
   DominatorTree DT(F);
 
   // Collect all possible basic blocks which may dominate all uses of allocas.
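Aside: the `GraphTraits<RematGraph *>` specialization in this diff is what lets the pass reuse LLVM's generic graph algorithms; `ReversePostOrderTraversal` only needs an entry node and a child-iterator range. Below is a minimal, self-contained sketch of the same pattern on a toy graph. `ToyGraph` and the node names are hypothetical; only the headers and traversal machinery are LLVM's.

```cpp
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

// A toy DAG shaped like RematGraph: each node carries edges to the
// nodes that produce its operands.
struct ToyGraph {
  struct Node {
    const char *Name;
    llvm::SmallVector<Node *> Operands;
  };
  Node *EntryNode;
};

namespace llvm {
// The same three hooks RematGraph provides: entry node plus a child
// iterator range over each node's operand edges.
template <> struct GraphTraits<ToyGraph *> {
  using NodeRef = ToyGraph::Node *;
  using ChildIteratorType = ToyGraph::Node **;

  static NodeRef getEntryNode(ToyGraph *G) { return G->EntryNode; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Operands.begin();
  }
  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};
} // end namespace llvm

int main() {
  // use -> add -> {x, y}
  ToyGraph::Node X{"x", {}}, Y{"y", {}};
  ToyGraph::Node Add{"add", {&X, &Y}};
  ToyGraph::Node Use{"use", {&Add}};
  ToyGraph G{&Use};

  // Reverse post-order visits the entry first, then each consumer before
  // the nodes feeding it: use, add, y, x.
  llvm::ReversePostOrderTraversal<ToyGraph *> RPOT(&G);
  for (ToyGraph::Node *N : RPOT)
    llvm::outs() << N->Name << "\n";
  return 0;
}
```

This ordering is why `rewriteMaterializableInstructions` skips the first RPOT element (the use itself) and why inserting each clone before the previous one leaves rematerialized operands ahead of their consumers in the block.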
@@ -2635,7 +2817,7 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
 }
 
 void coro::salvageDebugInfo(
-    SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
     DbgVariableIntrinsic *DVI, bool OptimizeFrame) {
   Function *F = DVI->getFunction();
   IRBuilder<> Builder(F->getContext());
@@ -2652,7 +2834,7 @@ void coro::salvageDebugInfo(
 
   while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
     if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
-      Storage = LdInst->getOperand(0);
+      Storage = LdInst->getPointerOperand();
       // FIXME: This is a heuristic that works around the fact that
       // LLVM IR debug intrinsics cannot yet distinguish between
       // memory and value locations: Because a dbg.declare(alloca) is
@@ -2662,7 +2844,7 @@ void coro::salvageDebugInfo(
       if (!SkipOutermostLoad)
         Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
     } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
-      Storage = StInst->getOperand(0);
+      Storage = StInst->getValueOperand();
     } else {
       SmallVector<uint64_t, 16> Ops;
       SmallVector<Value *, 0> AdditionalValues;
@@ -2682,38 +2864,44 @@ void coro::salvageDebugInfo(
   if (!Storage)
     return;
 
-  // Store a pointer to the coroutine frame object in an alloca so it
-  // is available throughout the function when producing unoptimized
-  // code. Extending the lifetime this way is correct because the
-  // variable has been declared by a dbg.declare intrinsic.
-  //
-  // Avoid to create the alloca would be eliminated by optimization
-  // passes and the corresponding dbg.declares would be invalid.
-  if (!OptimizeFrame)
-    if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) {
-      auto &Cached = DbgPtrAllocaCache[Storage];
-      if (!Cached) {
-        Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
-                                      Arg->getName() + ".debug");
-        Builder.CreateStore(Storage, Cached);
-      }
-      Storage = Cached;
-      // FIXME: LLVM lacks nuanced semantics to differentiate between
-      // memory and direct locations at the IR level. The backend will
-      // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
-      // location. Thus, if there are deref and offset operations in the
-      // expression, we need to add a DW_OP_deref at the *start* of the
-      // expression to first load the contents of the alloca before
-      // adjusting it with the expression.
-      Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
+  auto *StorageAsArg = dyn_cast<Argument>(Storage);
+  const bool IsSwiftAsyncArg =
+      StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);
+
+  // Swift async arguments are described by an entry value of the ABI-defined
+  // register containing the coroutine context.
+  if (IsSwiftAsyncArg && !Expr->isEntryValue())
+    Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);
+
+  // If the coroutine frame is an Argument, store it in an alloca to improve
+  // its availability (e.g. registers may be clobbered).
+  // Avoid this if optimizations are enabled (they would remove the alloca) or
+  // if the value is guaranteed to be available through other means (e.g. swift
+  // ABI guarantees).
+  if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
+    auto &Cached = ArgToAllocaMap[StorageAsArg];
+    if (!Cached) {
+      Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
+                                    Storage->getName() + ".debug");
+      Builder.CreateStore(Storage, Cached);
     }
+    Storage = Cached;
+    // FIXME: LLVM lacks nuanced semantics to differentiate between
+    // memory and direct locations at the IR level. The backend will
+    // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
+    // location. Thus, if there are deref and offset operations in the
+    // expression, we need to add a DW_OP_deref at the *start* of the
+    // expression to first load the contents of the alloca before
+    // adjusting it with the expression.
+    Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
+  }
 
   DVI->replaceVariableLocationOp(OriginalStorage, Storage);
   DVI->setExpression(Expr);
   // We only hoist dbg.declare today since it doesn't make sense to hoist
-  // dbg.value or dbg.addr since they do not have the same function wide
-  // guarantees that dbg.declare does.
-  if (!isa<DbgValueInst>(DVI) && !isa<DbgAddrIntrinsic>(DVI)) {
+  // dbg.value since it does not have the same function wide guarantees that
+  // dbg.declare does.
+  if (isa<DbgDeclareInst>(DVI)) {
     Instruction *InsertPt = nullptr;
     if (auto *I = dyn_cast<Instruction>(Storage))
       InsertPt = I->getInsertionPointAfterDef();
@@ -2724,7 +2912,71 @@ void coro::salvageDebugInfo(
   }
 }
 
-void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
+static void doRematerializations(
+    Function &F, SuspendCrossingInfo &Checker,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
+  if (F.hasOptNone())
+    return;
+
+  SpillInfo Spills;
+
+  // See if there are materializable instructions across suspend points
+  // We record these as the starting point to also identify materializable
+  // defs of uses in these operations
+  for (Instruction &I : instructions(F)) {
+    if (!MaterializableCallback(I))
+      continue;
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U))
+        Spills[&I].push_back(cast<Instruction>(U));
+  }
+
+  // Process each of the identified rematerializable instructions
+  // and add predecessor instructions that can also be rematerialized.
+  // This is actually a graph of instructions since we could potentially
+  // have multiple uses of a def in the set of predecessor instructions.
+  // The approach here is to maintain a graph of instructions for each bottom
+  // level instruction - where we have a unique set of instructions (nodes)
+  // and edges between them. We then walk the graph in reverse post-dominator
+  // order to insert them past the suspend point, but ensure that ordering is
+  // correct. We also rely on CSE removing duplicate defs for remats of
+  // different instructions with a def in common (rather than maintaining more
+  // complex graphs for each suspend point)
+
+  // We can do this by adding new nodes to the list for each suspend
+  // point. Then using standard GraphTraits to give a reverse post-order
+  // traversal when we insert the nodes after the suspend
+  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
+  for (auto &E : Spills) {
+    for (Instruction *U : E.second) {
+      // Don't process a user twice (this can happen if the instruction uses
+      // more than one rematerializable def)
+      if (AllRemats.count(U))
+        continue;
+
+      // Constructor creates the whole RematGraph for the given Use
+      auto RematUPtr =
+          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
+
+      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
+                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
+                 for (auto I = RPOT.begin(); I != RPOT.end();
+                      ++I) { (*I)->Node->dump(); } dbgs()
+                 << "\n";);
+
+      AllRemats[U] = std::move(RematUPtr);
+    }
+  }
+
+  // Rewrite materializable instructions to be materialized at the use
+  // point.
+  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
+  rewriteMaterializableInstructions(AllRemats);
+}
+
+void coro::buildCoroutineFrame(
+    Function &F, Shape &Shape,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
   // Don't eliminate swifterror in async functions that won't be split.
   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
     eliminateSwiftError(F, Shape);
@@ -2775,35 +3027,11 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
   // Build suspend crossing info.
   SuspendCrossingInfo Checker(F, Shape);
 
-  IRBuilder<> Builder(F.getContext());
+  doRematerializations(F, Checker, MaterializableCallback);
+
   FrameDataInfo FrameData;
   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
   SmallVector<Instruction*, 4> DeadInstructions;
-
-  {
-    SpillInfo Spills;
-    for (int Repeat = 0; Repeat < 4; ++Repeat) {
-      // See if there are materializable instructions across suspend points.
-      // FIXME: We can use a worklist to track the possible materialize
-      // instructions instead of iterating the whole function again and again.
-      for (Instruction &I : instructions(F))
-        if (materializable(I)) {
-          for (User *U : I.users())
-            if (Checker.isDefinitionAcrossSuspend(I, U))
-              Spills[&I].push_back(cast<Instruction>(U));
-        }
-
-      if (Spills.empty())
-        break;
-
-      // Rewrite materializable instructions to be materialized at the use
-      // point.
-      LLVM_DEBUG(dumpSpills("Materializations", Spills));
-      rewriteMaterializableInstructions(Builder, Spills);
-      Spills.clear();
-    }
-  }
-
   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce)
     sinkLifetimeStartMarkers(F, Shape, Checker);
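The net effect of the `buildCoroutineFrame` change is that the materializability test is no longer hard-coded: callers pass a `std::function` predicate, and `coro::defaultMaterializable` preserves the old behavior (casts, GEPs, binary operators, compares, selects). Below is a minimal sketch of what a caller-side override could look like, assuming the declarations this diff introduces in the coroutine-internal header; the `FreezeInst` case is illustrative only, not something this commit does.

```cpp
#include "llvm/IR/Instructions.h"
#include <functional>

// Declared in the coroutine internals alongside this diff (assumed here):
//   namespace llvm::coro {
//   bool defaultMaterializable(Instruction &V);
//   void buildCoroutineFrame(
//       Function &F, Shape &Shape,
//       const std::function<bool(Instruction &)> &MaterializableCallback);
//   }

// A custom predicate that keeps the default set and additionally treats
// freeze as cheap to recompute after a suspend point (a hypothetical
// policy, chosen purely for illustration).
static bool aggressiveMaterializable(llvm::Instruction &V) {
  return llvm::coro::defaultMaterializable(V) ||
         llvm::isa<llvm::FreezeInst>(V);
}

// A caller such as the CoroSplit pass would then forward the predicate,
// and from there it flows into RematGraph construction:
//   coro::buildCoroutineFrame(F, Shape, aggressiveMaterializable);
```

Widening the predicate changes which values are recomputed after resume rather than spilled to the coroutine frame, which is the point of threading the callback down through `doRematerializations` into `RematGraph`.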