author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000
commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) |
tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Coroutines/CoroFrame.cpp |
parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) |
Diffstat (limited to 'llvm/lib/Transforms/Coroutines/CoroFrame.cpp')
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 608
1 file changed, 418 insertions(+), 190 deletions(-)
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index e98c601648e0..1f373270f951 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -16,6 +16,7 @@
 #include "CoroInternal.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
@@ -37,6 +38,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
+#include <deque>
 #include <optional>
 
 using namespace llvm;
@@ -87,7 +89,7 @@ public:
 // crosses a suspend point.
 //
 namespace {
-struct SuspendCrossingInfo {
+class SuspendCrossingInfo {
   BlockToIndexMapping Mapping;
 
   struct BlockData {
@@ -96,20 +98,30 @@ struct SuspendCrossingInfo {
     bool Suspend = false;
     bool End = false;
    bool KillLoop = false;
+    bool Changed = false;
   };
   SmallVector<BlockData, SmallVectorThreshold> Block;
 
-  iterator_range<succ_iterator> successors(BlockData const &BD) const {
+  iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
     BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
-    return llvm::successors(BB);
+    return llvm::predecessors(BB);
   }
 
   BlockData &getBlockData(BasicBlock *BB) {
     return Block[Mapping.blockToIndex(BB)];
   }
 
+  /// Compute the BlockData for the current function in one iteration.
+  /// Returns whether the BlockData changes in this iteration.
+  /// Initialize - Whether this is the first iteration, we can optimize
+  /// the initial case a little bit by manual loop switch.
+  template <bool Initialize = false> bool computeBlockData();
+
+public:
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump() const;
   void dump(StringRef Label, BitVector const &BV) const;
+#endif
 
   SuspendCrossingInfo(Function &F, coro::Shape &Shape);
@@ -211,6 +223,72 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
 }
 #endif
 
+template <bool Initialize> bool SuspendCrossingInfo::computeBlockData() {
+  const size_t N = Mapping.size();
+  bool Changed = false;
+
+  for (size_t I = 0; I < N; ++I) {
+    auto &B = Block[I];
+
+    // We don't need to count the predecessors when initialization.
+    if constexpr (!Initialize)
+      // If all the predecessors of the current Block don't change,
+      // the BlockData for the current block must not change too.
+      if (all_of(predecessors(B), [this](BasicBlock *BB) {
+            return !Block[Mapping.blockToIndex(BB)].Changed;
+          })) {
+        B.Changed = false;
+        continue;
+      }
+
+    // Saved Consumes and Kills bitsets so that it is easy to see
+    // if anything changed after propagation.
+    auto SavedConsumes = B.Consumes;
+    auto SavedKills = B.Kills;
+
+    for (BasicBlock *PI : predecessors(B)) {
+      auto PrevNo = Mapping.blockToIndex(PI);
+      auto &P = Block[PrevNo];
+
+      // Propagate Kills and Consumes from predecessors into B.
+      B.Consumes |= P.Consumes;
+      B.Kills |= P.Kills;
+
+      // If block P is a suspend block, it should propagate kills into block
+      // B for every block P consumes.
+      if (P.Suspend)
+        B.Kills |= P.Consumes;
+    }
+
+    if (B.Suspend) {
+      // If block S is a suspend block, it should kill all of the blocks it
+      // consumes.
+      B.Kills |= B.Consumes;
+    } else if (B.End) {
+      // If block B is an end block, it should not propagate kills as the
+      // blocks following coro.end() are reached during initial invocation
+      // of the coroutine while all the data are still available on the
+      // stack or in the registers.
+      B.Kills.reset();
+    } else {
+      // This is reached when B block it not Suspend nor coro.end and it
+      // need to make sure that it is not in the kill set.
+      B.KillLoop |= B.Kills[I];
+      B.Kills.reset(I);
+    }
+
+    if constexpr (!Initialize) {
+      B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
+      Changed |= B.Changed;
+    }
+  }
+
+  if constexpr (Initialize)
+    return true;
+
+  return Changed;
+}
+
 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
     : Mapping(F) {
   const size_t N = Mapping.size();
@@ -222,6 +300,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
     B.Consumes.resize(N);
     B.Kills.resize(N);
     B.Consumes.set(I);
+    B.Changed = true;
   }
 
   // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
@@ -246,73 +325,123 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
       markSuspendBlock(Save);
   }
 
-  // Iterate propagating consumes and kills until they stop changing.
-  int Iteration = 0;
-  (void)Iteration;
-
-  bool Changed;
-  do {
-    LLVM_DEBUG(dbgs() << "iteration " << ++Iteration);
-    LLVM_DEBUG(dbgs() << "==============\n");
-
-    Changed = false;
-    for (size_t I = 0; I < N; ++I) {
-      auto &B = Block[I];
-      for (BasicBlock *SI : successors(B)) {
-
-        auto SuccNo = Mapping.blockToIndex(SI);
-
-        // Saved Consumes and Kills bitsets so that it is easy to see
-        // if anything changed after propagation.
-        auto &S = Block[SuccNo];
-        auto SavedConsumes = S.Consumes;
-        auto SavedKills = S.Kills;
-
-        // Propagate Kills and Consumes from block B into its successor S.
-        S.Consumes |= B.Consumes;
-        S.Kills |= B.Kills;
-
-        // If block B is a suspend block, it should propagate kills into the
-        // its successor for every block B consumes.
-        if (B.Suspend) {
-          S.Kills |= B.Consumes;
-        }
-        if (S.Suspend) {
-          // If block S is a suspend block, it should kill all of the blocks it
-          // consumes.
-          S.Kills |= S.Consumes;
-        } else if (S.End) {
-          // If block S is an end block, it should not propagate kills as the
-          // blocks following coro.end() are reached during initial invocation
-          // of the coroutine while all the data are still available on the
-          // stack or in the registers.
-          S.Kills.reset();
-        } else {
-          // This is reached when S block it not Suspend nor coro.end and it
-          // need to make sure that it is not in the kill set.
-          S.KillLoop |= S.Kills[SuccNo];
-          S.Kills.reset(SuccNo);
-        }
-
-        // See if anything changed.
-        Changed |= (S.Kills != SavedKills) || (S.Consumes != SavedConsumes);
-
-        if (S.Kills != SavedKills) {
-          LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI->getName()
-                            << "\n");
-          LLVM_DEBUG(dump("S.Kills", S.Kills));
-          LLVM_DEBUG(dump("SavedKills", SavedKills));
-        }
-        if (S.Consumes != SavedConsumes) {
-          LLVM_DEBUG(dbgs() << "\nblock " << I << " follower " << SI << "\n");
-          LLVM_DEBUG(dump("S.Consume", S.Consumes));
-          LLVM_DEBUG(dump("SavedCons", SavedConsumes));
-        }
-      }
-    }
-  } while (Changed);
-  LLVM_DEBUG(dump());
-}
+  computeBlockData</*Initialize=*/true>();
+
+  while (computeBlockData())
+    ;
+
+  LLVM_DEBUG(dump());
+}
+
+namespace {
+
+// RematGraph is used to construct a DAG for rematerializable instructions
+// When the constructor is invoked with a candidate instruction (which is
+// materializable) it builds a DAG of materializable instructions from that
+// point.
+// Typically, for each instruction identified as re-materializable across a
+// suspend point, a RematGraph will be created.
+struct RematGraph {
+  // Each RematNode in the graph contains the edges to instructions providing
+  // operands in the current node.
+  struct RematNode {
+    Instruction *Node;
+    SmallVector<RematNode *> Operands;
+    RematNode() = default;
+    RematNode(Instruction *V) : Node(V) {}
+  };
+
+  RematNode *EntryNode;
+  using RematNodeMap =
+      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
+  RematNodeMap Remats;
+  const std::function<bool(Instruction &)> &MaterializableCallback;
+  SuspendCrossingInfo &Checker;
+
+  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
+             Instruction *I, SuspendCrossingInfo &Checker)
+      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
+    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
+    EntryNode = FirstNode.get();
+    std::deque<std::unique_ptr<RematNode>> WorkList;
+    addNode(std::move(FirstNode), WorkList, cast<User>(I));
+    while (WorkList.size()) {
+      std::unique_ptr<RematNode> N = std::move(WorkList.front());
+      WorkList.pop_front();
+      addNode(std::move(N), WorkList, cast<User>(I));
+    }
+  }
+
+  void addNode(std::unique_ptr<RematNode> NUPtr,
+               std::deque<std::unique_ptr<RematNode>> &WorkList,
+               User *FirstUse) {
+    RematNode *N = NUPtr.get();
+    if (Remats.count(N->Node))
+      return;
+
+    // We haven't see this node yet - add to the list
+    Remats[N->Node] = std::move(NUPtr);
+    for (auto &Def : N->Node->operands()) {
+      Instruction *D = dyn_cast<Instruction>(Def.get());
+      if (!D || !MaterializableCallback(*D) ||
+          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
+        continue;
+
+      if (Remats.count(D)) {
+        // Already have this in the graph
+        N->Operands.push_back(Remats[D].get());
+        continue;
+      }
+
+      bool NoMatch = true;
+      for (auto &I : WorkList) {
+        if (I->Node == D) {
+          NoMatch = false;
+          N->Operands.push_back(I.get());
+          break;
+        }
+      }
+      if (NoMatch) {
+        // Create a new node
+        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
+        N->Operands.push_back(ChildNode.get());
+        WorkList.push_back(std::move(ChildNode));
+      }
+    }
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() const {
+    dbgs() << "Entry (";
+    if (EntryNode->Node->getParent()->hasName())
+      dbgs() << EntryNode->Node->getParent()->getName();
+    else
+      EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
+    dbgs() << ") : " << *EntryNode->Node << "\n";
+    for (auto &E : Remats) {
+      dbgs() << *(E.first) << "\n";
+      for (RematNode *U : E.second->Operands)
+        dbgs() << "  " << *U->Node << "\n";
+    }
+  }
+#endif
+};
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct GraphTraits<RematGraph *> {
+  using NodeRef = RematGraph::RematNode *;
+  using ChildIteratorType = RematGraph::RematNode **;
+
+  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return N->Operands.begin();
+  }
+  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
+};
+
+} // end namespace llvm
 
 #undef DEBUG_TYPE // "coro-suspend-crossing"
 #define DEBUG_TYPE "coro-frame"
@@ -425,6 +554,15 @@ static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
     I->dump();
   }
 }
+static void dumpRemats(
+    StringRef Title,
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
+  dbgs() << "------------- " << Title << "--------------\n";
+  for (const auto &E : RM) {
+    E.second->dump();
+    dbgs() << "--\n";
+  }
+}
 
 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
   dbgs() << "------------- Allocas --------------\n";
@@ -637,10 +775,10 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F,
     return;
   }
 
-  // Because there are pathes from the lifetime.start to coro.end
+  // Because there are paths from the lifetime.start to coro.end
   // for each alloca, the liferanges for every alloca is overlaped
   // in the blocks who contain coro.end and the successor blocks.
-  // So we choose to skip there blocks when we calculates the liferange
+  // So we choose to skip there blocks when we calculate the liferange
   // for each alloca. It should be reasonable since there shouldn't be uses
   // in these blocks and the coroutine frame shouldn't be used outside the
   // coroutine body.
@@ -820,7 +958,7 @@ void FrameTypeBuilder::finish(StructType *Ty) {
 static void cacheDIVar(FrameDataInfo &FrameData,
                        DenseMap<Value *, DILocalVariable *> &DIVarCache) {
   for (auto *V : FrameData.getAllDefs()) {
-    if (DIVarCache.find(V) != DIVarCache.end())
+    if (DIVarCache.contains(V))
       continue;
 
     auto DDIs = FindDbgDeclareUses(V);
@@ -852,18 +990,8 @@ static StringRef solveTypeName(Type *Ty) {
     return "__floating_type_";
   }
 
-  if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
-    if (PtrTy->isOpaque())
-      return "PointerType";
-    Type *PointeeTy = PtrTy->getNonOpaquePointerElementType();
-    auto Name = solveTypeName(PointeeTy);
-    if (Name == "UnknownType")
-      return "PointerType";
-    SmallString<16> Buffer;
-    Twine(Name + "_Ptr").toStringRef(Buffer);
-    auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
-    return MDName->getString();
-  }
+  if (Ty->isPointerTy())
+    return "PointerType";
 
   if (Ty->isStructTy()) {
     if (!cast<StructType>(Ty)->hasName())
@@ -1043,7 +1171,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
                                       dwarf::DW_ATE_unsigned_char)});
 
   for (auto *V : FrameData.getAllDefs()) {
-    if (DIVarCache.find(V) == DIVarCache.end())
+    if (!DIVarCache.contains(V))
      continue;
 
     auto Index = FrameData.getFieldIndex(V);
@@ -1075,7 +1203,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   // fields confilicts with each other.
   unsigned UnknownTypeNum = 0;
   for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
-    if (OffsetCache.find(Index) == OffsetCache.end())
+    if (!OffsetCache.contains(Index))
       continue;
 
     std::string Name;
@@ -1090,7 +1218,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
     AlignInBits = OffsetCache[Index].first * 8;
     OffsetInBits = OffsetCache[Index].second * 8;
 
-    if (NameCache.find(Index) != NameCache.end()) {
+    if (NameCache.contains(Index)) {
       Name = NameCache[Index].str();
       DITy = TyCache[Index];
     } else {
@@ -1282,7 +1410,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
 // function call or any of the memory intrinsics, we check whether this
 // instruction is prior to CoroBegin. To answer question 3, we track the offsets
 // of all aliases created for the alloca prior to CoroBegin but used after
-// CoroBegin. llvm::Optional is used to be able to represent the case when the
+// CoroBegin. std::optional is used to be able to represent the case when the
 // offset is unknown (e.g. when you have a PHINode that takes in different
 // offset values). We cannot handle unknown offsets and will assert. This is the
 // potential issue left out. An ideal solution would likely require a
@@ -1586,11 +1714,12 @@ static void createFramePtr(coro::Shape &Shape) {
 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
   auto *CB = Shape.CoroBegin;
   LLVMContext &C = CB->getContext();
+  Function *F = CB->getFunction();
   IRBuilder<> Builder(C);
   StructType *FrameTy = Shape.FrameTy;
   Value *FramePtr = Shape.FramePtr;
-  DominatorTree DT(*CB->getFunction());
-  SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> DbgPtrAllocaCache;
+  DominatorTree DT(*F);
+  SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
 
   // Create a GEP with the given index into the coroutine frame for the original
   // value Orig. Appends an extra 0 index for array-allocas, preserving the
@@ -1723,6 +1852,21 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
          SpillAlignment, E.first->getName() + Twine(".reload"));
 
       TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Def);
+      // Try best to find dbg.declare. If the spill is a temp, there may not
+      // be a direct dbg.declare. Walk up the load chain to find one from an
+      // alias.
+      if (F->getSubprogram()) {
+        auto *CurDef = Def;
+        while (DIs.empty() && isa<LoadInst>(CurDef)) {
+          auto *LdInst = cast<LoadInst>(CurDef);
+          // Only consider ptr to ptr same type load.
+          if (LdInst->getPointerOperandType() != LdInst->getType())
+            break;
+          CurDef = LdInst->getPointerOperand();
+          DIs = FindDbgDeclareUses(CurDef);
+        }
+      }
+
       for (DbgDeclareInst *DDI : DIs) {
        bool AllowUnresolved = false;
         // This dbg.declare is preserved for all coro-split function
@@ -1734,16 +1878,10 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
                       &*Builder.GetInsertPoint());
         // This dbg.declare is for the main function entry point. It
         // will be deleted in all coro-split functions.
-        coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame);
+        coro::salvageDebugInfo(ArgToAllocaMap, DDI, Shape.OptimizeFrame);
       }
     }
 
-    // Salvage debug info on any dbg.addr that we see. We do not insert them
-    // into each block where we have a use though.
-    if (auto *DI = dyn_cast<DbgAddrIntrinsic>(U)) {
-      coro::salvageDebugInfo(DbgPtrAllocaCache, DI, Shape.OptimizeFrame);
-    }
-
     // If we have a single edge PHINode, remove it and replace it with a
     // reload from the coroutine frame. (We already took care of multi edge
     // PHINodes by rewriting them in the rewritePHIs function).
@@ -1813,11 +1951,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
         DVI->replaceUsesOfWith(Alloca, G);
 
       for (Instruction *I : UsersToUpdate) {
-        // It is meaningless to remain the lifetime intrinsics refer for the
+        // It is meaningless to retain the lifetime intrinsics refer for the
         // member of coroutine frames and the meaningless lifetime intrinsics
         // are possible to block further optimizations.
-        if (I->isLifetimeStartOrEnd())
+        if (I->isLifetimeStartOrEnd()) {
+          I->eraseFromParent();
           continue;
+        }
 
         I->replaceUsesOfWith(Alloca, G);
       }
@@ -2089,11 +2229,12 @@ static void rewritePHIs(Function &F) {
     rewritePHIs(*BB);
 }
 
+/// Default materializable callback
 // Check for instructions that we can recreate on resume as opposed to spill
 // the result into a coroutine frame.
-static bool materializable(Instruction &V) {
-  return isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
-         isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V);
+bool coro::defaultMaterializable(Instruction &V) {
+  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
 }
 
 // Check for structural coroutine intrinsics that should not be spilled into
@@ -2103,41 +2244,82 @@ static bool isCoroutineStructureIntrinsic(Instruction &I) {
          isa<CoroSuspendInst>(&I);
 }
 
-// For every use of the value that is across suspend point, recreate that value
-// after a suspend point.
-static void rewriteMaterializableInstructions(IRBuilder<> &IRB,
-                                              const SpillInfo &Spills) {
-  for (const auto &E : Spills) {
-    Value *Def = E.first;
-    BasicBlock *CurrentBlock = nullptr;
+// For each instruction identified as materializable across the suspend point,
+// and its associated DAG of other rematerializable instructions,
+// recreate the DAG of instructions after the suspend point.
+static void rewriteMaterializableInstructions(
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
+        &AllRemats) {
+  // This has to be done in 2 phases
+  // Do the remats and record the required defs to be replaced in the
+  // original use instructions
+  // Once all the remats are complete, replace the uses in the final
+  // instructions with the new defs
+  typedef struct {
+    Instruction *Use;
+    Instruction *Def;
+    Instruction *Remat;
+  } ProcessNode;
+
+  SmallVector<ProcessNode> FinalInstructionsToProcess;
+
+  for (const auto &E : AllRemats) {
+    Instruction *Use = E.first;
     Instruction *CurrentMaterialization = nullptr;
-    for (Instruction *U : E.second) {
-      // If we have not seen this block, materialize the value.
-      if (CurrentBlock != U->getParent()) {
-
-        bool IsInCoroSuspendBlock = isa<AnyCoroSuspendInst>(U);
-        CurrentBlock = U->getParent();
-        auto *InsertBlock = IsInCoroSuspendBlock
-                                ? CurrentBlock->getSinglePredecessor()
-                                : CurrentBlock;
-        CurrentMaterialization = cast<Instruction>(Def)->clone();
-        CurrentMaterialization->setName(Def->getName());
-        CurrentMaterialization->insertBefore(
-            IsInCoroSuspendBlock ? InsertBlock->getTerminator()
-                                 : &*InsertBlock->getFirstInsertionPt());
-      }
-      if (auto *PN = dyn_cast<PHINode>(U)) {
-        assert(PN->getNumIncomingValues() == 1 &&
-               "unexpected number of incoming "
-               "values in the PHINode");
-        PN->replaceAllUsesWith(CurrentMaterialization);
-        PN->eraseFromParent();
-        continue;
-      }
-      // Replace all uses of Def in the current instruction with the
-      // CurrentMaterialization for the block.
-      U->replaceUsesOfWith(Def, CurrentMaterialization);
+    RematGraph *RG = E.second.get();
+    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
+    SmallVector<Instruction *> InstructionsToProcess;
+
+    // If the target use is actually a suspend instruction then we have to
+    // insert the remats into the end of the predecessor (there should only be
+    // one). This is so that suspend blocks always have the suspend instruction
+    // as the first instruction.
+    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
+    if (isa<AnyCoroSuspendInst>(Use)) {
+      BasicBlock *SuspendPredecessorBlock =
+          Use->getParent()->getSinglePredecessor();
+      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
+      InsertPoint = SuspendPredecessorBlock->getTerminator();
+    }
+
+    // Note: skip the first instruction as this is the actual use that we're
+    // rematerializing everything for.
+    auto I = RPOT.begin();
+    ++I;
+    for (; I != RPOT.end(); ++I) {
+      Instruction *D = (*I)->Node;
+      CurrentMaterialization = D->clone();
+      CurrentMaterialization->setName(D->getName());
+      CurrentMaterialization->insertBefore(InsertPoint);
+      InsertPoint = CurrentMaterialization;
+
+      // Replace all uses of Def in the instructions being added as part of
+      // this rematerialization group
+      for (auto &I : InstructionsToProcess)
+        I->replaceUsesOfWith(D, CurrentMaterialization);
+
+      // Don't replace the final use at this point as this can cause problems
+      // for other materializations. Instead, for any final use that uses a
+      // define that's being rematerialized, record the replace values
+      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
+        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
+          FinalInstructionsToProcess.push_back(
+              {Use, D, CurrentMaterialization});
+
+      InstructionsToProcess.push_back(CurrentMaterialization);
+    }
+  }
+
+  // Finally, replace the uses with the defines that we've just rematerialized
+  for (auto &R : FinalInstructionsToProcess) {
+    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
+      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+                                                "values in the PHINode");
+      PN->replaceAllUsesWith(R.Remat);
+      PN->eraseFromParent();
+      continue;
     }
+    R.Use->replaceUsesOfWith(R.Def, R.Remat);
   }
 }
@@ -2407,10 +2589,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
   IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
 
   auto ArgTy = cast<PointerType>(Arg.getType());
-  // swifterror arguments are required to have pointer-to-pointer type,
-  // so create a pointer-typed alloca with opaque pointers.
-  auto ValueTy = ArgTy->isOpaque() ? PointerType::getUnqual(F.getContext())
-                                   : ArgTy->getNonOpaquePointerElementType();
+  auto ValueTy = PointerType::getUnqual(F.getContext());
 
   // Reduce to the alloca case:
@@ -2523,6 +2702,9 @@ static void sinkSpillUsesAfterCoroBegin(Function &F,
 /// hence minimizing the amount of data we end up putting on the frame.
 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
                                      SuspendCrossingInfo &Checker) {
+  if (F.hasOptNone())
+    return;
+
   DominatorTree DT(F);
 
   // Collect all possible basic blocks which may dominate all uses of allocas.
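Aside: the `GraphTraits<RematGraph *>` specialization in this diff is what lets the pass reuse LLVM's generic graph algorithms; `ReversePostOrderTraversal` only needs an entry node and a child-iterator range. Below is a minimal, self-contained sketch of the same pattern on a toy graph. `ToyGraph` and the node names are hypothetical; only the headers and traversal machinery are LLVM's.

```cpp
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

// A toy DAG shaped like RematGraph: each node carries edges to the
// nodes that produce its operands.
struct ToyGraph {
  struct Node {
    const char *Name;
    llvm::SmallVector<Node *> Operands;
  };
  Node *EntryNode;
};

namespace llvm {
// The same three hooks RematGraph provides: entry node plus a child
// iterator range over each node's operand edges.
template <> struct GraphTraits<ToyGraph *> {
  using NodeRef = ToyGraph::Node *;
  using ChildIteratorType = ToyGraph::Node **;

  static NodeRef getEntryNode(ToyGraph *G) { return G->EntryNode; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Operands.begin();
  }
  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};
} // end namespace llvm

int main() {
  // use -> add -> {x, y}
  ToyGraph::Node X{"x", {}}, Y{"y", {}};
  ToyGraph::Node Add{"add", {&X, &Y}};
  ToyGraph::Node Use{"use", {&Add}};
  ToyGraph G{&Use};

  // Reverse post-order visits the entry first, then each consumer before
  // the nodes feeding it: use, add, y, x.
  llvm::ReversePostOrderTraversal<ToyGraph *> RPOT(&G);
  for (ToyGraph::Node *N : RPOT)
    llvm::outs() << N->Name << "\n";
  return 0;
}
```

This ordering is why `rewriteMaterializableInstructions` skips the first RPOT element (the use itself) and why inserting each clone before the previous one leaves rematerialized operands ahead of their consumers in the block.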
@@ -2635,7 +2817,7 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
 }
 
 void coro::salvageDebugInfo(
-    SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
     DbgVariableIntrinsic *DVI, bool OptimizeFrame) {
   Function *F = DVI->getFunction();
   IRBuilder<> Builder(F->getContext());
@@ -2652,7 +2834,7 @@ void coro::salvageDebugInfo(
 
   while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
     if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
-      Storage = LdInst->getOperand(0);
+      Storage = LdInst->getPointerOperand();
       // FIXME: This is a heuristic that works around the fact that
       // LLVM IR debug intrinsics cannot yet distinguish between
       // memory and value locations: Because a dbg.declare(alloca) is
@@ -2662,7 +2844,7 @@ void coro::salvageDebugInfo(
       if (!SkipOutermostLoad)
         Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
     } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
-      Storage = StInst->getOperand(0);
+      Storage = StInst->getValueOperand();
     } else {
       SmallVector<uint64_t, 16> Ops;
       SmallVector<Value *, 0> AdditionalValues;
@@ -2682,38 +2864,44 @@ void coro::salvageDebugInfo(
   if (!Storage)
     return;
 
-  // Store a pointer to the coroutine frame object in an alloca so it
-  // is available throughout the function when producing unoptimized
-  // code. Extending the lifetime this way is correct because the
-  // variable has been declared by a dbg.declare intrinsic.
-  //
-  // Avoid to create the alloca would be eliminated by optimization
-  // passes and the corresponding dbg.declares would be invalid.
-  if (!OptimizeFrame)
-    if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) {
-      auto &Cached = DbgPtrAllocaCache[Storage];
-      if (!Cached) {
-        Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
-                                      Arg->getName() + ".debug");
-        Builder.CreateStore(Storage, Cached);
-      }
-      Storage = Cached;
-      // FIXME: LLVM lacks nuanced semantics to differentiate between
-      // memory and direct locations at the IR level. The backend will
-      // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
-      // location. Thus, if there are deref and offset operations in the
-      // expression, we need to add a DW_OP_deref at the *start* of the
-      // expression to first load the contents of the alloca before
-      // adjusting it with the expression.
-      Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
+  auto *StorageAsArg = dyn_cast<Argument>(Storage);
+  const bool IsSwiftAsyncArg =
+      StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);
+
+  // Swift async arguments are described by an entry value of the ABI-defined
+  // register containing the coroutine context.
+  if (IsSwiftAsyncArg && !Expr->isEntryValue())
+    Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);
+
+  // If the coroutine frame is an Argument, store it in an alloca to improve
+  // its availability (e.g. registers may be clobbered).
+  // Avoid this if optimizations are enabled (they would remove the alloca) or
+  // if the value is guaranteed to be available through other means (e.g. swift
+  // ABI guarantees).
+  if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
+    auto &Cached = ArgToAllocaMap[StorageAsArg];
+    if (!Cached) {
+      Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
+                                    Storage->getName() + ".debug");
+      Builder.CreateStore(Storage, Cached);
     }
+    Storage = Cached;
+    // FIXME: LLVM lacks nuanced semantics to differentiate between
+    // memory and direct locations at the IR level. The backend will
+    // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
+    // location. Thus, if there are deref and offset operations in the
+    // expression, we need to add a DW_OP_deref at the *start* of the
+    // expression to first load the contents of the alloca before
+    // adjusting it with the expression.
+    Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
+  }
 
   DVI->replaceVariableLocationOp(OriginalStorage, Storage);
   DVI->setExpression(Expr);
   // We only hoist dbg.declare today since it doesn't make sense to hoist
-  // dbg.value or dbg.addr since they do not have the same function wide
-  // guarantees that dbg.declare does.
-  if (!isa<DbgValueInst>(DVI) && !isa<DbgAddrIntrinsic>(DVI)) {
+  // dbg.value since it does not have the same function wide guarantees that
+  // dbg.declare does.
+  if (isa<DbgDeclareInst>(DVI)) {
     Instruction *InsertPt = nullptr;
     if (auto *I = dyn_cast<Instruction>(Storage))
       InsertPt = I->getInsertionPointAfterDef();
@@ -2724,7 +2912,71 @@ void coro::salvageDebugInfo(
   }
 }
 
-void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
+static void doRematerializations(
+    Function &F, SuspendCrossingInfo &Checker,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
+  if (F.hasOptNone())
+    return;
+
+  SpillInfo Spills;
+
+  // See if there are materializable instructions across suspend points
+  // We record these as the starting point to also identify materializable
+  // defs of uses in these operations
+  for (Instruction &I : instructions(F)) {
+    if (!MaterializableCallback(I))
+      continue;
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U))
+        Spills[&I].push_back(cast<Instruction>(U));
+  }
+
+  // Process each of the identified rematerializable instructions
+  // and add predecessor instructions that can also be rematerialized.
+  // This is actually a graph of instructions since we could potentially
+  // have multiple uses of a def in the set of predecessor instructions.
+  // The approach here is to maintain a graph of instructions for each bottom
+  // level instruction - where we have a unique set of instructions (nodes)
+  // and edges between them. We then walk the graph in reverse post-dominator
+  // order to insert them past the suspend point, but ensure that ordering is
+  // correct. We also rely on CSE removing duplicate defs for remats of
+  // different instructions with a def in common (rather than maintaining more
+  // complex graphs for each suspend point)
+
+  // We can do this by adding new nodes to the list for each suspend
+  // point. Then using standard GraphTraits to give a reverse post-order
+  // traversal when we insert the nodes after the suspend
+  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
+  for (auto &E : Spills) {
+    for (Instruction *U : E.second) {
+      // Don't process a user twice (this can happen if the instruction uses
+      // more than one rematerializable def)
+      if (AllRemats.count(U))
+        continue;
+
+      // Constructor creates the whole RematGraph for the given Use
+      auto RematUPtr =
+          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
+
+      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
+                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
+                 for (auto I = RPOT.begin(); I != RPOT.end();
+                      ++I) { (*I)->Node->dump(); } dbgs()
+                 << "\n";);
+
+      AllRemats[U] = std::move(RematUPtr);
+    }
+  }
+
+  // Rewrite materializable instructions to be materialized at the use
+  // point.
+  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
+  rewriteMaterializableInstructions(AllRemats);
+}
+
+void coro::buildCoroutineFrame(
+    Function &F, Shape &Shape,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
   // Don't eliminate swifterror in async functions that won't be split.
   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
     eliminateSwiftError(F, Shape);
@@ -2775,35 +3027,11 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
   // Build suspend crossing info.
   SuspendCrossingInfo Checker(F, Shape);
 
-  IRBuilder<> Builder(F.getContext());
+  doRematerializations(F, Checker, MaterializableCallback);
+
   FrameDataInfo FrameData;
   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
   SmallVector<Instruction*, 4> DeadInstructions;
-
-  {
-    SpillInfo Spills;
-    for (int Repeat = 0; Repeat < 4; ++Repeat) {
-      // See if there are materializable instructions across suspend points.
-      // FIXME: We can use a worklist to track the possible materialize
-      // instructions instead of iterating the whole function again and again.
-      for (Instruction &I : instructions(F))
-        if (materializable(I)) {
-          for (User *U : I.users())
-            if (Checker.isDefinitionAcrossSuspend(I, U))
-              Spills[&I].push_back(cast<Instruction>(U));
-        }
-
-      if (Spills.empty())
-        break;
-
-      // Rewrite materializable instructions to be materialized at the use
-      // point.
-      LLVM_DEBUG(dumpSpills("Materializations", Spills));
-      rewriteMaterializableInstructions(Builder, Spills);
-      Spills.clear();
-    }
-  }
-
   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce)
     sinkLifetimeStartMarkers(F, Shape, Checker);
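The net effect of the `buildCoroutineFrame` change is that the materializability test is no longer hard-coded: callers pass a `std::function` predicate, and `coro::defaultMaterializable` preserves the old behavior (casts, GEPs, binary operators, compares, selects). Below is a minimal sketch of what a caller-side override could look like, assuming the declarations this diff introduces in the coroutine-internal header; the `FreezeInst` case is illustrative only, not something this commit does.

```cpp
#include "llvm/IR/Instructions.h"
#include <functional>

// Declared in the coroutine internals alongside this diff (assumed here):
//   namespace llvm::coro {
//   bool defaultMaterializable(Instruction &V);
//   void buildCoroutineFrame(
//       Function &F, Shape &Shape,
//       const std::function<bool(Instruction &)> &MaterializableCallback);
//   }

// A custom predicate that keeps the default set and additionally treats
// freeze as cheap to recompute after a suspend point (a hypothetical
// policy, chosen purely for illustration).
static bool aggressiveMaterializable(llvm::Instruction &V) {
  return llvm::coro::defaultMaterializable(V) ||
         llvm::isa<llvm::FreezeInst>(V);
}

// A caller such as the CoroSplit pass would then forward the predicate,
// and from there it flows into RematGraph construction:
//   coro::buildCoroutineFrame(F, Shape, aggressiveMaterializable);
```

Widening the predicate changes which values are recomputed after resume rather than spilled to the coroutine frame, which is the point of threading the callback down through `doRematerializations` into `RematGraph`.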