Diffstat (limited to 'llvm/lib/Transforms/Coroutines')
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroCleanup.cpp |  28
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroEarly.cpp   |  67
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroElide.cpp   | 211
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroFrame.cpp   | 515
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroInstr.h     |  17
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroInternal.h  |  42
-rw-r--r-- | llvm/lib/Transforms/Coroutines/CoroSplit.cpp   | 318
-rw-r--r-- | llvm/lib/Transforms/Coroutines/Coroutines.cpp  |   7
8 files changed, 928 insertions, 277 deletions
diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index c2dbd6f41642..233eae37c497 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -5,9 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// This pass lowers all remaining coroutine intrinsics. -//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Coroutines/CoroCleanup.h" #include "CoroInternal.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -90,12 +89,26 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { // After replacement were made we can cleanup the function body a little. simplifyCFG(F); } + return Changed; } -//===----------------------------------------------------------------------===// -// Top Level Driver -//===----------------------------------------------------------------------===// +static bool declaresCoroCleanupIntrinsics(const Module &M) { + return coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin", + "llvm.coro.subfn.addr", "llvm.coro.free", + "llvm.coro.id", "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once"}); +} + +PreservedAnalyses CoroCleanupPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &M = *F.getParent(); + if (!declaresCoroCleanupIntrinsics(M) || + !Lowerer(M).lowerRemainingCoroIntrinsics(F)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} namespace { @@ -111,10 +124,7 @@ struct CoroCleanupLegacy : FunctionPass { // This pass has work to do only if we find intrinsics we are going to lower // in the module. bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin", - "llvm.coro.subfn.addr", "llvm.coro.free", - "llvm.coro.id", "llvm.coro.id.retcon", - "llvm.coro.id.retcon.once"})) + if (declaresCoroCleanupIntrinsics(M)) L = std::make_unique<Lowerer>(M); return false; } diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index e73fb9eeb1e9..242e6c3f6b23 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -5,13 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// This pass lowers coroutine intrinsics that hide the details of the exact -// calling convention for coroutine resume and destroy functions and details of -// the structure of the coroutine frame. -//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Coroutines/CoroEarly.h" #include "CoroInternal.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" @@ -28,7 +24,7 @@ class Lowerer : public coro::LowererBase { PointerType *const AnyResumeFnPtrTy; Constant *NoopCoro = nullptr; - void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind); + void lowerResumeOrDestroy(CallBase &CB, CoroSubFnInst::ResumeKind); void lowerCoroPromise(CoroPromiseInst *Intrin); void lowerCoroDone(IntrinsicInst *II); void lowerCoroNoop(IntrinsicInst *II); @@ -47,12 +43,11 @@ public: // an address returned by coro.subfn.addr intrinsic. 
This is done so that // CGPassManager recognizes devirtualization when CoroElide pass replaces a call // to coro.subfn.addr with an appropriate function address. -void Lowerer::lowerResumeOrDestroy(CallSite CS, +void Lowerer::lowerResumeOrDestroy(CallBase &CB, CoroSubFnInst::ResumeKind Index) { - Value *ResumeAddr = - makeSubFnCall(CS.getArgOperand(0), Index, CS.getInstruction()); - CS.setCalledFunction(ResumeAddr); - CS.setCallingConv(CallingConv::Fast); + Value *ResumeAddr = makeSubFnCall(CB.getArgOperand(0), Index, &CB); + CB.setCalledOperand(ResumeAddr); + CB.setCallingConv(CallingConv::Fast); } // Coroutine promise field is always at the fixed offset from the beginning of @@ -64,14 +59,14 @@ void Lowerer::lowerResumeOrDestroy(CallSite CS, // TODO: Handle the case when coroutine promise alloca has align override. void Lowerer::lowerCoroPromise(CoroPromiseInst *Intrin) { Value *Operand = Intrin->getArgOperand(0); - unsigned Alignement = Intrin->getAlignment(); + Align Alignment = Intrin->getAlignment(); Type *Int8Ty = Builder.getInt8Ty(); auto *SampleStruct = StructType::get(Context, {AnyResumeFnPtrTy, AnyResumeFnPtrTy, Int8Ty}); const DataLayout &DL = TheModule.getDataLayout(); int64_t Offset = alignTo( - DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignement); + DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignment); if (Intrin->isFromPromise()) Offset = -Offset; @@ -98,7 +93,7 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) { Builder.SetInsertPoint(II); auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy); - auto *Load = Builder.CreateLoad(BCI); + auto *Load = Builder.CreateLoad(FrameTy, BCI); auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); II->replaceAllUsesWith(Cond); @@ -156,8 +151,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { SmallVector<CoroFreeInst *, 4> CoroFrees; for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) { Instruction &I = *IB++; - if (auto CS = CallSite(&I)) { - switch (CS.getIntrinsicID()) { + if (auto *CB = dyn_cast<CallBase>(&I)) { + switch (CB->getIntrinsicID()) { default: continue; case Intrinsic::coro_free: @@ -167,13 +162,13 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { // Make sure that final suspend point is not duplicated as CoroSplit // pass expects that there is at most one final suspend point. if (cast<CoroSuspendInst>(&I)->isFinal()) - CS.setCannotDuplicate(); + CB->setCannotDuplicate(); break; case Intrinsic::coro_end: // Make sure that fallthrough coro.end is not duplicated as CoroSplit // pass expects that there is at most one fallthrough coro.end. 
if (cast<CoroEndInst>(&I)->isFallthrough()) - CS.setCannotDuplicate(); + CB->setCannotDuplicate(); break; case Intrinsic::coro_noop: lowerCoroNoop(cast<IntrinsicInst>(&I)); @@ -195,10 +190,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); break; case Intrinsic::coro_resume: - lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex); + lowerResumeOrDestroy(*CB, CoroSubFnInst::ResumeIndex); break; case Intrinsic::coro_destroy: - lowerResumeOrDestroy(CS, CoroSubFnInst::DestroyIndex); + lowerResumeOrDestroy(*CB, CoroSubFnInst::DestroyIndex); break; case Intrinsic::coro_promise: lowerCoroPromise(cast<CoroPromiseInst>(&I)); @@ -219,9 +214,23 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { return Changed; } -//===----------------------------------------------------------------------===// -// Top Level Driver -//===----------------------------------------------------------------------===// +static bool declaresCoroEarlyIntrinsics(const Module &M) { + return coro::declaresIntrinsics( + M, {"llvm.coro.id", "llvm.coro.id.retcon", "llvm.coro.id.retcon.once", + "llvm.coro.destroy", "llvm.coro.done", "llvm.coro.end", + "llvm.coro.noop", "llvm.coro.free", "llvm.coro.promise", + "llvm.coro.resume", "llvm.coro.suspend"}); +} + +PreservedAnalyses CoroEarlyPass::run(Function &F, FunctionAnalysisManager &) { + Module &M = *F.getParent(); + if (!declaresCoroEarlyIntrinsics(M) || !Lowerer(M).lowerEarlyIntrinsics(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} namespace { @@ -236,17 +245,7 @@ struct CoroEarlyLegacy : public FunctionPass { // This pass has work to do only if we find intrinsics we are going to lower // in the module. bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics(M, {"llvm.coro.id", - "llvm.coro.id.retcon", - "llvm.coro.id.retcon.once", - "llvm.coro.destroy", - "llvm.coro.done", - "llvm.coro.end", - "llvm.coro.noop", - "llvm.coro.free", - "llvm.coro.promise", - "llvm.coro.resume", - "llvm.coro.suspend"})) + if (declaresCoroEarlyIntrinsics(M)) L = std::make_unique<Lowerer>(M); return false; } diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp index 23d22e23861a..9d364b3097c1 100644 --- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -5,12 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// This pass replaces dynamic allocation of coroutine frame with alloca and -// replaces calls to llvm.coro.resume and llvm.coro.destroy with direct calls -// to coroutine sub-functions. 
-//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Coroutines/CoroElide.h" #include "CoroInternal.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Dominators.h" @@ -30,14 +28,19 @@ struct Lowerer : coro::LowererBase { SmallVector<CoroBeginInst *, 1> CoroBegins; SmallVector<CoroAllocInst *, 1> CoroAllocs; SmallVector<CoroSubFnInst *, 4> ResumeAddr; - SmallVector<CoroSubFnInst *, 4> DestroyAddr; + DenseMap<CoroBeginInst *, SmallVector<CoroSubFnInst *, 4>> DestroyAddr; SmallVector<CoroFreeInst *, 1> CoroFrees; + SmallPtrSet<const SwitchInst *, 4> CoroSuspendSwitches; Lowerer(Module &M) : LowererBase(M) {} - void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA); + void elideHeapAllocations(Function *F, uint64_t FrameSize, Align FrameAlign, + AAResults &AA); bool shouldElide(Function *F, DominatorTree &DT) const; + void collectPostSplitCoroIds(Function *F); bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT); + bool hasEscapePath(const CoroBeginInst *, + const SmallPtrSetImpl<BasicBlock *> &) const; }; } // end anonymous namespace @@ -90,10 +93,23 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) { } } -// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type. -static Type *getFrameType(Function *Resume) { - auto *ArgType = Resume->arg_begin()->getType(); - return cast<PointerType>(ArgType)->getElementType(); +// Given a resume function @f.resume(%f.frame* %frame), returns the size +// and expected alignment of %f.frame type. +static std::pair<uint64_t, Align> getFrameLayout(Function *Resume) { + // Prefer to pull information from the function attributes. + auto Size = Resume->getParamDereferenceableBytes(0); + auto Align = Resume->getParamAlign(0); + + // If those aren't given, extract them from the type. + if (Size == 0 || !Align) { + auto *FrameTy = Resume->arg_begin()->getType()->getPointerElementType(); + + const DataLayout &DL = Resume->getParent()->getDataLayout(); + if (!Size) Size = DL.getTypeAllocSize(FrameTy); + if (!Align) Align = DL.getABITypeAlign(FrameTy); + } + + return std::make_pair(Size, *Align); } // Finds first non alloca instruction in the entry block of a function. @@ -106,8 +122,9 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) { // To elide heap allocations we need to suppress code blocks guarded by // llvm.coro.alloc and llvm.coro.free instructions. -void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) { - LLVMContext &C = FrameTy->getContext(); +void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize, + Align FrameAlign, AAResults &AA) { + LLVMContext &C = F->getContext(); auto *InsertPt = getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction()); @@ -128,7 +145,9 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) { // here. Possibly we will need to do a mini SROA here and break the coroutine // frame into individual AllocaInst recreating the original alignment. 
const DataLayout &DL = F->getParent()->getDataLayout(); + auto FrameTy = ArrayType::get(Type::getInt8Ty(C), FrameSize); auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt); + Frame->setAlignment(FrameAlign); auto *FrameVoidPtr = new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt); @@ -142,44 +161,92 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) { removeTailCallAttribute(Frame, AA); } +bool Lowerer::hasEscapePath(const CoroBeginInst *CB, + const SmallPtrSetImpl<BasicBlock *> &TIs) const { + const auto &It = DestroyAddr.find(CB); + assert(It != DestroyAddr.end()); + + // Limit the number of blocks we visit. + unsigned Limit = 32 * (1 + It->second.size()); + + SmallVector<const BasicBlock *, 32> Worklist; + Worklist.push_back(CB->getParent()); + + SmallPtrSet<const BasicBlock *, 32> Visited; + // Consider basicblock of coro.destroy as visited one, so that we + // skip the path pass through coro.destroy. + for (auto *DA : It->second) + Visited.insert(DA->getParent()); + + do { + const auto *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + if (TIs.count(BB)) + return true; + + // Conservatively say that there is potentially a path. + if (!--Limit) + return true; + + auto TI = BB->getTerminator(); + // Although the default dest of coro.suspend switches is suspend pointer + // which means a escape path to normal terminator, it is reasonable to skip + // it since coroutine frame doesn't change outside the coroutine body. + if (isa<SwitchInst>(TI) && + CoroSuspendSwitches.count(cast<SwitchInst>(TI))) { + Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(1)); + Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(2)); + } else + Worklist.append(succ_begin(BB), succ_end(BB)); + + } while (!Worklist.empty()); + + // We have exhausted all possible paths and are certain that coro.begin can + // not reach to any of terminators. + return false; +} + bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const { // If no CoroAllocs, we cannot suppress allocation, so elision is not // possible. if (CoroAllocs.empty()) return false; - // Check that for every coro.begin there is a coro.destroy directly - // referencing the SSA value of that coro.begin along a non-exceptional path. + // Check that for every coro.begin there is at least one coro.destroy directly + // referencing the SSA value of that coro.begin along each + // non-exceptional path. // If the value escaped, then coro.destroy would have been referencing a // memory location storing that value and not the virtual register. + SmallPtrSet<BasicBlock *, 8> Terminators; // First gather all of the non-exceptional terminators for the function. - SmallPtrSet<Instruction *, 8> Terminators; - for (BasicBlock &B : *F) { - auto *TI = B.getTerminator(); - if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() && - !isa<UnreachableInst>(TI)) - Terminators.insert(TI); - } + // Consider the final coro.suspend as the real terminator when the current + // function is a coroutine. + for (BasicBlock &B : *F) { + auto *TI = B.getTerminator(); + if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() && + !isa<UnreachableInst>(TI)) + Terminators.insert(&B); + } // Filter out the coro.destroy that lie along exceptional paths. 
- SmallPtrSet<CoroSubFnInst *, 4> DAs; - for (CoroSubFnInst *DA : DestroyAddr) { - for (Instruction *TI : Terminators) { - if (DT.dominates(DA, TI)) { - DAs.insert(DA); - break; + SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins; + for (auto &It : DestroyAddr) { + for (Instruction *DA : It.second) { + for (BasicBlock *TI : Terminators) { + if (DT.dominates(DA, TI->getTerminator())) { + ReferencedCoroBegins.insert(It.first); + break; + } } } - } - // Find all the coro.begin referenced by coro.destroy along happy paths. - SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins; - for (CoroSubFnInst *DA : DAs) { - if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame())) - ReferencedCoroBegins.insert(CB); - else - return false; + // Whether there is any paths from coro.begin to Terminators which not pass + // through any of the coro.destroys. + if (!ReferencedCoroBegins.count(It.first) && + !hasEscapePath(It.first, Terminators)) + ReferencedCoroBegins.insert(It.first); } // If size of the set is the same as total number of coro.begin, that means we @@ -188,6 +255,30 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const { return ReferencedCoroBegins.size() == CoroBegins.size(); } +void Lowerer::collectPostSplitCoroIds(Function *F) { + CoroIds.clear(); + CoroSuspendSwitches.clear(); + for (auto &I : instructions(F)) { + if (auto *CII = dyn_cast<CoroIdInst>(&I)) + if (CII->getInfo().isPostSplit()) + // If it is the coroutine itself, don't touch it. + if (CII->getCoroutine() != CII->getFunction()) + CoroIds.push_back(CII); + + // Consider case like: + // %0 = call i8 @llvm.coro.suspend(...) + // switch i8 %0, label %suspend [i8 0, label %resume + // i8 1, label %cleanup] + // and collect the SwitchInsts which are used by escape analysis later. + if (auto *CSI = dyn_cast<CoroSuspendInst>(&I)) + if (CSI->hasOneUse() && isa<SwitchInst>(CSI->use_begin()->getUser())) { + SwitchInst *SWI = cast<SwitchInst>(CSI->use_begin()->getUser()); + if (SWI->getNumCases() == 2) + CoroSuspendSwitches.insert(SWI); + } + } +} + bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA, DominatorTree &DT) { CoroBegins.clear(); @@ -218,7 +309,7 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA, ResumeAddr.push_back(II); break; case CoroSubFnInst::DestroyIndex: - DestroyAddr.push_back(II); + DestroyAddr[CB].push_back(II); break; default: llvm_unreachable("unexpected coro.subfn.addr constant"); @@ -241,11 +332,13 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA, Resumers, ShouldElide ? 
CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex); - replaceWithConstant(DestroyAddrConstant, DestroyAddr); + for (auto &It : DestroyAddr) + replaceWithConstant(DestroyAddrConstant, It.second); if (ShouldElide) { - auto *FrameTy = getFrameType(cast<Function>(ResumeAddrConstant)); - elideHeapAllocations(CoroId->getFunction(), FrameTy, AA); + auto FrameSizeAndAlign = getFrameLayout(cast<Function>(ResumeAddrConstant)); + elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign.first, + FrameSizeAndAlign.second, AA); coro::replaceCoroFree(CoroId, /*Elide=*/true); } @@ -272,9 +365,31 @@ static bool replaceDevirtTrigger(Function &F) { return true; } -//===----------------------------------------------------------------------===// -// Top Level Driver -//===----------------------------------------------------------------------===// +static bool declaresCoroElideIntrinsics(Module &M) { + return coro::declaresIntrinsics(M, {"llvm.coro.id"}); +} + +PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) { + auto &M = *F.getParent(); + if (!declaresCoroElideIntrinsics(M)) + return PreservedAnalyses::all(); + + Lowerer L(M); + L.CoroIds.clear(); + L.collectPostSplitCoroIds(&F); + // If we did not find any coro.id, there is nothing to do. + if (L.CoroIds.empty()) + return PreservedAnalyses::all(); + + AAResults &AA = AM.getResult<AAManager>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + + bool Changed = false; + for (auto *CII : L.CoroIds) + Changed |= L.processCoroId(CII, AA, DT); + + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} namespace { struct CoroElideLegacy : FunctionPass { @@ -286,7 +401,7 @@ struct CoroElideLegacy : FunctionPass { std::unique_ptr<Lowerer> L; bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics(M, {"llvm.coro.id"})) + if (declaresCoroElideIntrinsics(M)) L = std::make_unique<Lowerer>(M); return false; } @@ -301,15 +416,7 @@ struct CoroElideLegacy : FunctionPass { Changed = replaceDevirtTrigger(F); L->CoroIds.clear(); - - // Collect all PostSplit coro.ids. - for (auto &I : instructions(F)) - if (auto *CII = dyn_cast<CoroIdInst>(&I)) - if (CII->getInfo().isPostSplit()) - // If it is the coroutine itself, don't touch it. - if (CII->getCoroutine() != CII->getFunction()) - L->CoroIds.push_back(CII); - + L->collectPostSplitCoroIds(&F); // If we did not find any coro.id, there is nothing to do. 
if (L->CoroIds.empty()) return Changed; diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 2c42cf8a6d25..f55501a05d85 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -18,18 +18,22 @@ #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/circular_raw_ostream.h" +#include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> using namespace llvm; @@ -105,7 +109,6 @@ struct SuspendCrossingInfo { size_t const DefIndex = Mapping.blockToIndex(DefBB); size_t const UseIndex = Mapping.blockToIndex(UseBB); - assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def"); bool const Result = Block[UseIndex].Kills[DefIndex]; LLVM_DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName() << " answer is " << Result << "\n"); @@ -338,52 +341,182 @@ namespace { // coroutine frame and if the alignment specified on the Alloca instruction // differs from the natural alignment of the alloca type we will need to insert // padding. -struct PaddingCalculator { +class FrameTypeBuilder { + struct Field { + uint64_t Size; + uint64_t Offset; + Spill *ForSpill; + Type *Ty; + unsigned FieldIndex; + Align Alignment; + Align TyAlignment; + }; + const DataLayout &DL; LLVMContext &Context; - unsigned StructSize = 0; + uint64_t StructSize = 0; + Align StructAlign; + bool IsFinished = false; - PaddingCalculator(LLVMContext &Context, DataLayout const &DL) - : DL(DL), Context(Context) {} + SmallVector<Field, 8> Fields; + DenseMap<Value*, unsigned> FieldIndexByKey; - // Replicate the logic from IR/DataLayout.cpp to match field offset - // computation for LLVM structs. - void addType(Type *Ty) { - unsigned TyAlign = DL.getABITypeAlignment(Ty); - if ((StructSize & (TyAlign - 1)) != 0) - StructSize = alignTo(StructSize, TyAlign); +public: + FrameTypeBuilder(LLVMContext &Context, DataLayout const &DL) + : DL(DL), Context(Context) {} - StructSize += DL.getTypeAllocSize(Ty); // Consume space for this data item. - } + class FieldId { + size_t Value; + explicit FieldId(size_t Value) : Value(Value) {} + + friend class FrameTypeBuilder; + }; + + /// Add a field to this structure for the storage of an `alloca` + /// instruction. + FieldId addFieldForAlloca(AllocaInst *AI, Spill *ForSpill = nullptr, + bool IsHeader = false) { + Type *Ty = AI->getAllocatedType(); + + // Make an array type if this is a static array allocation. 
+ if (AI->isArrayAllocation()) { + if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) + Ty = ArrayType::get(Ty, CI->getValue().getZExtValue()); + else + report_fatal_error("Coroutines cannot handle non static allocas yet"); + } - void addTypes(SmallVectorImpl<Type *> const &Types) { - for (auto *Ty : Types) - addType(Ty); + return addField(Ty, AI->getAlign(), ForSpill, IsHeader); } - unsigned computePadding(Type *Ty, unsigned ForcedAlignment) { - unsigned TyAlign = DL.getABITypeAlignment(Ty); - auto Natural = alignTo(StructSize, TyAlign); - auto Forced = alignTo(StructSize, ForcedAlignment); + /// Add a field to this structure. + FieldId addField(Type *Ty, MaybeAlign FieldAlignment, + Spill *ForSpill = nullptr, + bool IsHeader = false) { + assert(!IsFinished && "adding fields to a finished builder"); + assert(Ty && "must provide a type for a field"); - // Return how many bytes of padding we need to insert. - if (Natural != Forced) - return std::max(Natural, Forced) - StructSize; + // The field size is always the alloc size of the type. + uint64_t FieldSize = DL.getTypeAllocSize(Ty); - // Rely on natural alignment. - return 0; + // The field alignment might not be the type alignment, but we need + // to remember the type alignment anyway to build the type. + Align TyAlignment = DL.getABITypeAlign(Ty); + if (!FieldAlignment) FieldAlignment = TyAlignment; + + // Lay out header fields immediately. + uint64_t Offset; + if (IsHeader) { + Offset = alignTo(StructSize, FieldAlignment); + StructSize = Offset + FieldSize; + + // Everything else has a flexible offset. + } else { + Offset = OptimizedStructLayoutField::FlexibleOffset; + } + + Fields.push_back({FieldSize, Offset, ForSpill, Ty, 0, + *FieldAlignment, TyAlignment}); + return FieldId(Fields.size() - 1); + } + + /// Finish the layout and set the body on the given type. + void finish(StructType *Ty); + + uint64_t getStructSize() const { + assert(IsFinished && "not yet finished!"); + return StructSize; } - // If padding required, return the padding field type to insert. - ArrayType *getPaddingType(Type *Ty, unsigned ForcedAlignment) { - if (auto Padding = computePadding(Ty, ForcedAlignment)) - return ArrayType::get(Type::getInt8Ty(Context), Padding); + Align getStructAlign() const { + assert(IsFinished && "not yet finished!"); + return StructAlign; + } - return nullptr; + unsigned getFieldIndex(FieldId Id) const { + assert(IsFinished && "not yet finished!"); + return Fields[Id.Value].FieldIndex; } }; } // namespace +void FrameTypeBuilder::finish(StructType *Ty) { + assert(!IsFinished && "already finished!"); + + // Prepare the optimal-layout field array. + // The Id in the layout field is a pointer to our Field for it. + SmallVector<OptimizedStructLayoutField, 8> LayoutFields; + LayoutFields.reserve(Fields.size()); + for (auto &Field : Fields) { + LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment, + Field.Offset); + } + + // Perform layout. + auto SizeAndAlign = performOptimizedStructLayout(LayoutFields); + StructSize = SizeAndAlign.first; + StructAlign = SizeAndAlign.second; + + auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & { + return *static_cast<Field *>(const_cast<void*>(LayoutField.Id)); + }; + + // We need to produce a packed struct type if there's a field whose + // assigned offset isn't a multiple of its natural type alignment. 
+ bool Packed = [&] { + for (auto &LayoutField : LayoutFields) { + auto &F = getField(LayoutField); + if (!isAligned(F.TyAlignment, LayoutField.Offset)) + return true; + } + return false; + }(); + + // Build the struct body. + SmallVector<Type*, 16> FieldTypes; + FieldTypes.reserve(LayoutFields.size() * 3 / 2); + uint64_t LastOffset = 0; + for (auto &LayoutField : LayoutFields) { + auto &F = getField(LayoutField); + + auto Offset = LayoutField.Offset; + + // Add a padding field if there's a padding gap and we're either + // building a packed struct or the padding gap is more than we'd + // get from aligning to the field type's natural alignment. + assert(Offset >= LastOffset); + if (Offset != LastOffset) { + if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset) + FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context), + Offset - LastOffset)); + } + + // Record the layout information into both the Field and the + // original Spill, if there is one. + F.Offset = Offset; + F.FieldIndex = FieldTypes.size(); + if (F.ForSpill) { + F.ForSpill->setFieldIndex(F.FieldIndex); + } + + FieldTypes.push_back(F.Ty); + LastOffset = Offset + F.Size; + } + + Ty->setBody(FieldTypes, Packed); + +#ifndef NDEBUG + // Check that the IR layout matches the offsets we expect. + auto Layout = DL.getStructLayout(Ty); + for (auto &F : Fields) { + assert(Ty->getElementType(F.FieldIndex) == F.Ty); + assert(Layout->getElementOffset(F.FieldIndex) == F.Offset); + } +#endif + + IsFinished = true; +} + // Build a struct that will keep state for an active coroutine. // struct f.frame { // ResumeFnTy ResumeFnAddr; @@ -396,13 +529,17 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, SpillInfo &Spills) { LLVMContext &C = F.getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); - PaddingCalculator Padder(C, DL); - SmallString<32> Name(F.getName()); - Name.append(".Frame"); - StructType *FrameTy = StructType::create(C, Name); - SmallVector<Type *, 8> Types; + StructType *FrameTy = [&] { + SmallString<32> Name(F.getName()); + Name.append(".Frame"); + return StructType::create(C, Name); + }(); + + FrameTypeBuilder B(C, DL); AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); + Optional<FrameTypeBuilder::FieldId> PromiseFieldId; + Optional<FrameTypeBuilder::FieldId> SwitchIndexFieldId; if (Shape.ABI == coro::ABI::Switch) { auto *FramePtrTy = FrameTy->getPointerTo(); @@ -410,74 +547,74 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, /*IsVarArg=*/false); auto *FnPtrTy = FnTy->getPointerTo(); - // Figure out how wide should be an integer type storing the suspend index. + // Add header fields for the resume and destroy functions. + // We can rely on these being perfectly packed. + B.addField(FnPtrTy, None, nullptr, /*header*/ true); + B.addField(FnPtrTy, None, nullptr, /*header*/ true); + + // Add a header field for the promise if there is one. + if (PromiseAlloca) { + PromiseFieldId = + B.addFieldForAlloca(PromiseAlloca, nullptr, /*header*/ true); + } + + // Add a field to store the suspend index. This doesn't need to + // be in the header. unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); - Type *PromiseType = PromiseAlloca - ? 
PromiseAlloca->getType()->getElementType() - : Type::getInt1Ty(C); Type *IndexType = Type::getIntNTy(C, IndexBits); - Types.push_back(FnPtrTy); - Types.push_back(FnPtrTy); - Types.push_back(PromiseType); - Types.push_back(IndexType); + + SwitchIndexFieldId = B.addField(IndexType, None); } else { assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); } Value *CurrentDef = nullptr; - Padder.addTypes(Types); - // Create an entry for every spilled value. for (auto &S : Spills) { + // We can have multiple entries in Spills for a single value, but + // they should form a contiguous run. Ignore all but the first. if (CurrentDef == S.def()) continue; CurrentDef = S.def(); - // PromiseAlloca was already added to Types array earlier. - if (CurrentDef == PromiseAlloca) - continue; - uint64_t Count = 1; - Type *Ty = nullptr; + assert(CurrentDef != PromiseAlloca && + "recorded spill use of promise alloca?"); + if (auto *AI = dyn_cast<AllocaInst>(CurrentDef)) { - Ty = AI->getAllocatedType(); - if (unsigned AllocaAlignment = AI->getAlignment()) { - // If alignment is specified in alloca, see if we need to insert extra - // padding. - if (auto PaddingTy = Padder.getPaddingType(Ty, AllocaAlignment)) { - Types.push_back(PaddingTy); - Padder.addType(PaddingTy); - } - } - if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) - Count = CI->getValue().getZExtValue(); - else - report_fatal_error("Coroutines cannot handle non static allocas yet"); + B.addFieldForAlloca(AI, &S); } else { - Ty = CurrentDef->getType(); + Type *Ty = CurrentDef->getType(); + B.addField(Ty, None, &S); } - S.setFieldIndex(Types.size()); - if (Count == 1) - Types.push_back(Ty); - else - Types.push_back(ArrayType::get(Ty, Count)); - Padder.addType(Ty); } - FrameTy->setBody(Types); + + B.finish(FrameTy); + Shape.FrameAlign = B.getStructAlign(); + Shape.FrameSize = B.getStructSize(); switch (Shape.ABI) { + // In the switch ABI, remember the field indices for the promise and + // switch-index fields. case coro::ABI::Switch: + Shape.SwitchLowering.IndexField = + B.getFieldIndex(*SwitchIndexFieldId); + Shape.SwitchLowering.PromiseField = + (PromiseAlloca ? B.getFieldIndex(*PromiseFieldId) : 0); + + // Also round the frame size up to a multiple of its alignment, as is + // generally expected in C/C++. + Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign); break; - // Remember whether the frame is inline in the storage. + // In the retcon ABI, remember whether the frame is inline in the storage. case coro::ABI::Retcon: case coro::ABI::RetconOnce: { - auto &Layout = F.getParent()->getDataLayout(); auto Id = Shape.getRetconCoroId(); Shape.RetconLowering.IsFrameInlineInStorage - = (Layout.getTypeAllocSize(FrameTy) <= Id->getStorageSize() && - Layout.getABITypeAlignment(FrameTy) <= Id->getStorageAlignment()); + = (B.getStructSize() <= Id->getStorageSize() && + B.getStructAlign() <= Id->getStorageAlignment()); break; } } @@ -606,10 +743,12 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { // we remember allocas and their indices to be handled once we processed // all the spills. SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas; - // Promise alloca (if present) has a fixed field number. + + // Promise alloca (if present) doesn't show in the spills and has a + // special field number. 
if (auto *PromiseAlloca = Shape.getPromiseAlloca()) { assert(Shape.ABI == coro::ABI::Switch); - Allocas.emplace_back(PromiseAlloca, coro::Shape::SwitchFieldIndex::Promise); + Allocas.emplace_back(PromiseAlloca, Shape.getPromiseField()); } // Create a GEP with the given index into the coroutine frame for the original @@ -636,12 +775,12 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { }; // Create a load instruction to reload the spilled value from the coroutine - // frame. - auto CreateReload = [&](Instruction *InsertBefore) { + // frame. Populates the Value pointer reference provided with the frame GEP. + auto CreateReload = [&](Instruction *InsertBefore, Value *&G) { assert(Index != InvalidFieldIndex && "accessing unassigned field number"); Builder.SetInsertPoint(InsertBefore); - auto *G = GetFramePointer(Index, CurrentValue); + G = GetFramePointer(Index, CurrentValue); G->setName(CurrentValue->getName() + Twine(".reload.addr")); return isa<AllocaInst>(CurrentValue) @@ -650,6 +789,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { CurrentValue->getName() + Twine(".reload")); }; + Value *GEP = nullptr, *CurrentGEP = nullptr; for (auto const &E : Spills) { // If we have not seen the value, generate a spill. if (CurrentValue != E.def()) { @@ -722,7 +862,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { // If we have not seen the use block, generate a reload in it. if (CurrentBlock != E.userBlock()) { CurrentBlock = E.userBlock(); - CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt()); + CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt(), GEP); } // If we have a single edge PHINode, remove it and replace it with a reload @@ -736,6 +876,19 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { continue; } + // If we have not seen this GEP instruction, migrate any dbg.declare from + // the alloca to it. + if (CurrentGEP != GEP) { + CurrentGEP = GEP; + TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(CurrentValue); + if (!DIs.empty()) + DIBuilder(*CurrentBlock->getParent()->getParent(), + /*AllowUnresolved*/ false) + .insertDeclare(CurrentGEP, DIs.front()->getVariable(), + DIs.front()->getExpression(), + DIs.front()->getDebugLoc(), DIs.front()); + } + // Replace all uses of CurrentValue in the current instruction with reload. E.user()->replaceUsesOfWith(CurrentValue, CurrentReload); } @@ -746,14 +899,38 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); Shape.AllocaSpillBlock = SpillBlock; - // If we found any allocas, replace all of their remaining uses with Geps. - // Note: we cannot do it indiscriminately as some of the uses may not be - // dominated by CoroBegin. + + // retcon and retcon.once lowering assumes all uses have been sunk. + if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce) { + // If we found any allocas, replace all of their remaining uses with Geps. + Builder.SetInsertPoint(&SpillBlock->front()); + for (auto &P : Allocas) { + auto *G = GetFramePointer(P.second, P.first); + + // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) + // here, as we are changing location of the instruction. 
+ G->takeName(P.first); + P.first->replaceAllUsesWith(G); + P.first->eraseFromParent(); + } + return FramePtr; + } + + // If we found any alloca, replace all of their remaining uses with GEP + // instructions. Because new dbg.declare have been created for these alloca, + // we also delete the original dbg.declare and replace other uses with undef. + // Note: We cannot replace the alloca with GEP instructions indiscriminately, + // as some of the uses may not be dominated by CoroBegin. bool MightNeedToCopy = false; Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); SmallVector<Instruction *, 4> UsersToUpdate; for (auto &P : Allocas) { AllocaInst *const A = P.first; + + for (auto *DI : FindDbgDeclareUses(A)) + DI->eraseFromParent(); + replaceDbgUsesWithUndef(A); + UsersToUpdate.clear(); for (User *U : A->users()) { auto *I = cast<Instruction>(U); @@ -784,7 +961,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { "Coroutines cannot handle copying of array allocas yet"); auto *G = GetFramePointer(P.second, A); - auto *Value = Builder.CreateLoad(A); + auto *Value = Builder.CreateLoad(A->getAllocatedType(), A); Builder.CreateStore(Value, G); } } @@ -1106,7 +1283,7 @@ static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, // Allocate memory. auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); - Alloca->setAlignment(MaybeAlign(AI->getAlignment())); + Alloca->setAlignment(Align(AI->getAlignment())); for (auto U : AI->users()) { // Replace gets with the allocation. @@ -1166,7 +1343,7 @@ static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, auto FnTy = FunctionType::get(ValueTy, {}, false); auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); - auto Call = Builder.CreateCall(Fn, {}); + auto Call = Builder.CreateCall(FnTy, Fn, {}); Shape.SwiftErrorOps.push_back(Call); return Call; @@ -1182,7 +1359,7 @@ static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, {V->getType()}, false); auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); - auto Call = Builder.CreateCall(Fn, { V }); + auto Call = Builder.CreateCall(FnTy, Fn, { V }); Shape.SwiftErrorOps.push_back(Call); return Call; @@ -1322,11 +1499,125 @@ static void eliminateSwiftError(Function &F, coro::Shape &Shape) { } } -void coro::buildCoroutineFrame(Function &F, Shape &Shape) { - // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite - // access to local variables. - LowerDbgDeclare(F); +/// retcon and retcon.once conventions assume that all spill uses can be sunk +/// after the coro.begin intrinsic. +static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills, + CoroBeginInst *CoroBegin) { + DominatorTree Dom(F); + + SmallSetVector<Instruction *, 32> ToMove; + SmallVector<Instruction *, 32> Worklist; + + // Collect all users that precede coro.begin. + for (auto const &Entry : Spills) { + auto *SpillDef = Entry.def(); + for (User *U : SpillDef->users()) { + auto Inst = cast<Instruction>(U); + if (Inst->getParent() != CoroBegin->getParent() || + Dom.dominates(CoroBegin, Inst)) + continue; + if (ToMove.insert(Inst)) + Worklist.push_back(Inst); + } + } + // Recursively collect users before coro.begin. + while (!Worklist.empty()) { + auto *Def = Worklist.back(); + Worklist.pop_back(); + for (User *U : Def->users()) { + auto Inst = cast<Instruction>(U); + if (Dom.dominates(CoroBegin, Inst)) + continue; + if (ToMove.insert(Inst)) + Worklist.push_back(Inst); + } + } + + // Sort by dominance. 
+ SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end()); + std::sort(InsertionList.begin(), InsertionList.end(), + [&Dom](Instruction *A, Instruction *B) -> bool { + // If a dominates b it should preceed (<) b. + return Dom.dominates(A, B); + }); + Instruction *InsertPt = CoroBegin->getNextNode(); + for (Instruction *Inst : InsertionList) + Inst->moveBefore(InsertPt); + + return; +} + +/// For each local variable that all of its user are only used inside one of +/// suspended region, we sink their lifetime.start markers to the place where +/// after the suspend block. Doing so minimizes the lifetime of each variable, +/// hence minimizing the amount of data we end up putting on the frame. +static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape, + SuspendCrossingInfo &Checker) { + DominatorTree DT(F); + + // Collect all possible basic blocks which may dominate all uses of allocas. + SmallPtrSet<BasicBlock *, 4> DomSet; + DomSet.insert(&F.getEntryBlock()); + for (auto *CSI : Shape.CoroSuspends) { + BasicBlock *SuspendBlock = CSI->getParent(); + assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() && + "should have split coro.suspend into its own block"); + DomSet.insert(SuspendBlock->getSingleSuccessor()); + } + + for (Instruction &I : instructions(F)) { + if (!isa<AllocaInst>(&I)) + continue; + + for (BasicBlock *DomBB : DomSet) { + bool Valid = true; + SmallVector<Instruction *, 1> BCInsts; + + auto isUsedByLifetimeStart = [&](Instruction *I) { + if (isa<BitCastInst>(I) && I->hasOneUse()) + if (auto *IT = dyn_cast<IntrinsicInst>(I->user_back())) + return IT->getIntrinsicID() == Intrinsic::lifetime_start; + return false; + }; + + for (User *U : I.users()) { + Instruction *UI = cast<Instruction>(U); + // For all users except lifetime.start markers, if they are all + // dominated by one of the basic blocks and do not cross + // suspend points as well, then there is no need to spill the + // instruction. + if (!DT.dominates(DomBB, UI->getParent()) || + Checker.isDefinitionAcrossSuspend(DomBB, U)) { + // Skip bitcast used by lifetime.start markers. + if (isUsedByLifetimeStart(UI)) { + BCInsts.push_back(UI); + continue; + } + Valid = false; + break; + } + } + // Sink lifetime.start markers to dominate block when they are + // only used outside the region. + if (Valid && BCInsts.size() != 0) { + auto *NewBitcast = BCInsts[0]->clone(); + auto *NewLifetime = cast<Instruction>(BCInsts[0]->user_back())->clone(); + NewLifetime->replaceUsesOfWith(BCInsts[0], NewBitcast); + NewBitcast->insertBefore(DomBB->getTerminator()); + NewLifetime->insertBefore(DomBB->getTerminator()); + + // All the outsided lifetime.start markers are no longer necessary. + for (Instruction *S : BCInsts) { + S->user_back()->eraseFromParent(); + } + break; + } + } + } +} + +void coro::buildCoroutineFrame(Function &F, Shape &Shape) { eliminateSwiftError(F, Shape); if (Shape.ABI == coro::ABI::Switch && @@ -1376,6 +1667,25 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { Spills.clear(); } + sinkLifetimeStartMarkers(F, Shape, Checker); + // Collect lifetime.start info for each alloca. 
+ using LifetimeStart = SmallPtrSet<Instruction *, 2>; + llvm::DenseMap<Instruction *, std::unique_ptr<LifetimeStart>> LifetimeMap; + for (Instruction &I : instructions(F)) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II || II->getIntrinsicID() != Intrinsic::lifetime_start) + continue; + + if (auto *OpInst = dyn_cast<BitCastInst>(I.getOperand(1))) + if (auto *AI = dyn_cast<AllocaInst>(OpInst->getOperand(0))) { + + if (LifetimeMap.find(AI) == LifetimeMap.end()) + LifetimeMap[AI] = std::make_unique<LifetimeStart>(); + + LifetimeMap[AI]->insert(OpInst); + } + } + // Collect the spills for arguments and other not-materializable values. for (Argument &A : F.args()) for (User *U : A.users()) @@ -1421,16 +1731,31 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { continue; } - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) { + auto Iter = LifetimeMap.find(&I); + for (User *U : I.users()) { + bool NeedSpill = false; + + // Check against lifetime.start if the instruction has the info. + if (Iter != LifetimeMap.end()) + for (auto *S : *Iter->second) { + if ((NeedSpill = Checker.isDefinitionAcrossSuspend(*S, U))) + break; + } + else + NeedSpill = Checker.isDefinitionAcrossSuspend(I, U); + + if (NeedSpill) { // We cannot spill a token. if (I.getType()->isTokenTy()) report_fatal_error( "token definition is separated from the use by a suspend point"); Spills.emplace_back(&I, U); } + } } LLVM_DEBUG(dump("Spills", Spills)); + if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce) + sinkSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin); Shape.FrameTy = buildFrameType(F, Shape, Spills); Shape.FramePtr = insertSpills(Spills, Shape); lowerLocalAllocas(LocalAllocas, DeadInstructions); diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h index de2d2920cb15..320137526db8 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -211,8 +211,8 @@ public: return cast<ConstantInt>(getArgOperand(SizeArg))->getZExtValue(); } - uint64_t getStorageAlignment() const { - return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + Align getStorageAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue(); } Value *getStorage() const { @@ -338,11 +338,16 @@ class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst { enum { FrameArg, AlignArg, FromArg }; public: + /// Are we translating from the frame to the promise (false) or from + /// the promise to the frame (true)? bool isFromPromise() const { return cast<Constant>(getArgOperand(FromArg))->isOneValue(); } - unsigned getAlignment() const { - return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + + /// The required alignment of the promise. This must match the + /// alignment of the promise alloca in the coroutine. 
+ Align getAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue(); } // Methods to support type inquiry through isa, cast, and dyn_cast: @@ -463,8 +468,8 @@ public: Value *getSize() const { return getArgOperand(SizeArg); } - unsigned getAlignment() const { - return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + Align getAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue(); } // Methods to support type inquiry through isa, cast, and dyn_cast: diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 7eb35400c0d5..bd76e93c9124 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -96,17 +96,22 @@ struct LLVM_LIBRARY_VISIBILITY Shape { struct SwitchFieldIndex { enum { Resume, - Destroy, - Promise, - Index, - /// The index of the first spill field. - FirstSpill + Destroy + + // The promise field is always at a fixed offset from the start of + // frame given its type, but the index isn't a constant for all + // possible frames. + + // The switch-index field isn't at a fixed offset or index, either; + // we just work it in where it fits best. }; }; coro::ABI ABI; StructType *FrameTy; + Align FrameAlign; + uint64_t FrameSize; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; @@ -114,6 +119,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape { SwitchInst *ResumeSwitch; AllocaInst *PromiseAlloca; BasicBlock *ResumeEntryBlock; + unsigned IndexField; + unsigned PromiseField; bool HasFinalSuspend; }; @@ -141,10 +148,15 @@ struct LLVM_LIBRARY_VISIBILITY Shape { return cast<AnyCoroIdRetconInst>(CoroBegin->getId()); } + unsigned getSwitchIndexField() const { + assert(ABI == coro::ABI::Switch); + assert(FrameTy && "frame type not assigned"); + return SwitchLowering.IndexField; + } IntegerType *getIndexType() const { assert(ABI == coro::ABI::Switch); assert(FrameTy && "frame type not assigned"); - return cast<IntegerType>(FrameTy->getElementType(SwitchFieldIndex::Index)); + return cast<IntegerType>(FrameTy->getElementType(getSwitchIndexField())); } ConstantInt *getIndex(uint64_t Value) const { return ConstantInt::get(getIndexType(), Value); @@ -203,23 +215,17 @@ struct LLVM_LIBRARY_VISIBILITY Shape { llvm_unreachable("Unknown coro::ABI enum"); } - unsigned getFirstSpillFieldIndex() const { - switch (ABI) { - case coro::ABI::Switch: - return SwitchFieldIndex::FirstSpill; - - case coro::ABI::Retcon: - case coro::ABI::RetconOnce: - return 0; - } - llvm_unreachable("Unknown coro::ABI enum"); - } - AllocaInst *getPromiseAlloca() const { if (ABI == coro::ABI::Switch) return SwitchLowering.PromiseAlloca; return nullptr; } + unsigned getPromiseField() const { + assert(ABI == coro::ABI::Switch); + assert(FrameTy && "frame type not assigned"); + assert(SwitchLowering.PromiseAlloca && "no promise alloca"); + return SwitchLowering.PromiseField; + } /// Allocate memory according to the rules of the active lowering. /// diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 66cb3e74e53e..9c4392e7999b 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -18,6 +18,7 @@ // coroutine. 
//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Coroutines/CoroSplit.h" #include "CoroInstr.h" #include "CoroInternal.h" #include "llvm/ADT/DenseMap.h" @@ -31,7 +32,6 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -59,6 +59,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -74,7 +75,7 @@ using namespace llvm; namespace { -/// A little helper class for building +/// A little helper class for building class CoroCloner { public: enum class Kind { @@ -283,7 +284,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { auto *FramePtr = Shape.FramePtr; auto *FrameTy = Shape.FrameTy; auto *GepIndex = Builder.CreateStructGEP( - FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); + FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr"); auto *Index = Builder.CreateLoad(Shape.getIndexType(), GepIndex, "index"); auto *Switch = Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size()); @@ -309,7 +310,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.CreateStore(NullPtr, GepIndex); } else { auto *GepIndex = Builder.CreateStructGEP( - FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); + FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr"); Builder.CreateStore(IndexVal, GepIndex); } Save->replaceAllUsesWith(ConstantTokenNone::get(C)); @@ -562,11 +563,12 @@ void CoroCloner::replaceEntryBlock() { // In the original function, the AllocaSpillBlock is a block immediately // following the allocation of the frame object which defines GEPs for // all the allocas that have been moved into the frame, and it ends by - // branching to the original beginning of the coroutine. Make this + // branching to the original beginning of the coroutine. Make this // the entry block of the cloned function. auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]); + auto *OldEntry = &NewF->getEntryBlock(); Entry->setName("entry" + Suffix); - Entry->moveBefore(&NewF->getEntryBlock()); + Entry->moveBefore(OldEntry); Entry->getTerminator()->eraseFromParent(); // Clear all predecessors of the new entry block. There should be @@ -579,8 +581,14 @@ void CoroCloner::replaceEntryBlock() { Builder.CreateUnreachable(); BranchToEntry->eraseFromParent(); - // TODO: move any allocas into Entry that weren't moved into the frame. - // (Currently we move all allocas into the frame.) + // Move any allocas into Entry that weren't moved into the frame. + for (auto IT = OldEntry->begin(), End = OldEntry->end(); IT != End;) { + Instruction &I = *IT++; + if (!isa<AllocaInst>(&I) || I.use_empty()) + continue; + + I.moveBefore(*Entry, Entry->getFirstInsertionPt()); + } // Branch from the entry to the appropriate place. Builder.SetInsertPoint(Entry); @@ -630,12 +638,23 @@ Value *CoroCloner::deriveNewFramePointer() { // Otherwise, load the real frame from the opaque storage. 
auto FramePtrPtr = Builder.CreateBitCast(NewStorage, FramePtrTy->getPointerTo()); - return Builder.CreateLoad(FramePtrPtr); + return Builder.CreateLoad(FramePtrTy, FramePtrPtr); } } llvm_unreachable("bad ABI"); } +static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context, + unsigned ParamIndex, + uint64_t Size, Align Alignment) { + AttrBuilder ParamAttrs; + ParamAttrs.addAttribute(Attribute::NonNull); + ParamAttrs.addAttribute(Attribute::NoAlias); + ParamAttrs.addAlignmentAttr(Alignment); + ParamAttrs.addDereferenceableAttr(Size); + Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs); +} + /// Clone the body of the original function into a resume function of /// some sort. void CoroCloner::create() { @@ -684,6 +703,9 @@ void CoroCloner::create() { // original function. This should include optimization settings and so on. NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, OrigAttrs.getFnAttributes()); + + addFramePointerAttrs(NewAttrs, Context, 0, + Shape.FrameSize, Shape.FrameAlign); break; case coro::ABI::Retcon: @@ -691,13 +713,13 @@ void CoroCloner::create() { // If we have a continuation prototype, just use its attributes, // full-stop. NewAttrs = Shape.RetconLowering.ResumePrototype->getAttributes(); + + addFramePointerAttrs(NewAttrs, Context, 0, + Shape.getRetconCoroId()->getStorageSize(), + Shape.getRetconCoroId()->getStorageAlignment()); break; } - // Make the frame parameter nonnull and noalias. - NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NonNull); - NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NoAlias); - switch (Shape.ABI) { // In these ABIs, the cloned functions always return 'void', and the // existing return sites are meaningless. Note that for unique @@ -872,7 +894,8 @@ static void postSplitCleanup(Function &F) { // For now, we do a mandatory verification step because we don't // entirely trust this pass. Note that we don't want to add a verifier // pass to FPM below because it will also verify all the global data. - verifyFunction(F); + if (verifyFunction(F, &errs())) + report_fatal_error("Broken function"); legacy::FunctionPassManager FPM(F.getParent()); @@ -911,17 +934,14 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { BasicBlock *UnconditionalSucc = nullptr; Instruction *I = InitialInst; - while (I->isTerminator()) { + while (I->isTerminator() || + (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) { if (isa<ReturnInst>(I)) { if (I != InitialInst) { // If InitialInst is an unconditional branch, // remove PHI values that come from basic block of InitialInst if (UnconditionalSucc) - for (PHINode &PN : UnconditionalSucc->phis()) { - int idx = PN.getBasicBlockIndex(InitialInst->getParent()); - if (idx != -1) - PN.removeIncomingValue(idx); - } + UnconditionalSucc->removePredecessor(InitialInst->getParent(), true); ReplaceInstWithInst(InitialInst, I->clone()); } return true; @@ -935,6 +955,29 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { I = BB->getFirstNonPHIOrDbgOrLifetime(); continue; } + } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) { + auto *BR = dyn_cast<BranchInst>(I->getNextNode()); + if (BR && BR->isConditional() && CondCmp == BR->getCondition()) { + // If the case number of suspended switch instruction is reduced to + // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. + // And the comparsion looks like : %cond = icmp eq i8 %V, constant. 
+ ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1)); + if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) { + Value *V = CondCmp->getOperand(0); + auto it = ResolvedValues.find(V); + if (it != ResolvedValues.end()) + V = it->second; + + if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) { + BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue()) + ? BR->getSuccessor(0) + : BR->getSuccessor(1); + scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); + I = BB->getFirstNonPHIOrDbgOrLifetime(); + continue; + } + } + } } else if (auto *SI = dyn_cast<SwitchInst>(I)) { Value *V = SI->getCondition(); auto it = ResolvedValues.find(V); @@ -952,6 +995,37 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { return false; } +// Check whether CI obeys the rules of musttail attribute. +static bool shouldBeMustTail(const CallInst &CI, const Function &F) { + if (CI.isInlineAsm()) + return false; + + // Match prototypes and calling conventions of resume function. + FunctionType *CalleeTy = CI.getFunctionType(); + if (!CalleeTy->getReturnType()->isVoidTy() || (CalleeTy->getNumParams() != 1)) + return false; + + Type *CalleeParmTy = CalleeTy->getParamType(0); + if (!CalleeParmTy->isPointerTy() || + (CalleeParmTy->getPointerAddressSpace() != 0)) + return false; + + if (CI.getCallingConv() != F.getCallingConv()) + return false; + + // CI should not has any ABI-impacting function attributes. + static const Attribute::AttrKind ABIAttrs[] = { + Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, + Attribute::Preallocated, Attribute::InReg, Attribute::Returned, + Attribute::SwiftSelf, Attribute::SwiftError}; + AttributeList Attrs = CI.getAttributes(); + for (auto AK : ABIAttrs) + if (Attrs.hasParamAttribute(0, AK)) + return false; + + return true; +} + // Add musttail to any resume instructions that is immediately followed by a // suspend (i.e. ret). We do this even in -O0 to support guaranteed tail call // for symmetrical coroutine control transfer (C++ Coroutines TS extension). @@ -964,11 +1038,8 @@ static void addMustTailToCoroResumes(Function &F) { SmallVector<CallInst *, 4> Resumes; for (auto &I : instructions(F)) if (auto *Call = dyn_cast<CallInst>(&I)) - if (auto *CalledValue = Call->getCalledValue()) - // CoroEarly pass replaced coro resumes with indirect calls to an - // address return by CoroSubFnInst intrinsic. See if it is one of those. - if (isa<CoroSubFnInst>(CalledValue->stripPointerCasts())) - Resumes.push_back(Call); + if (shouldBeMustTail(*Call, F)) + Resumes.push_back(Call); // Set musttail on those that are followed by a ret instruction. for (CallInst *Call : Resumes) @@ -993,8 +1064,8 @@ static void handleNoSuspendCoroutine(coro::Shape &Shape) { coro::replaceCoroFree(SwitchId, /*Elide=*/AllocInst != nullptr); if (AllocInst) { IRBuilder<> Builder(AllocInst); - // FIXME: Need to handle overaligned members. 
auto *Frame = Builder.CreateAlloca(Shape.FrameTy); + Frame->setAlignment(Shape.FrameAlign); auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); AllocInst->replaceAllUsesWith(Builder.getFalse()); AllocInst->eraseFromParent(); @@ -1023,7 +1094,7 @@ static bool hasCallsInBlockBetween(Instruction *From, Instruction *To) { if (isa<IntrinsicInst>(I)) continue; - if (CallSite(I)) + if (isa<CallBase>(I)) return true; } return false; @@ -1093,13 +1164,11 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, Prev = Pred->getTerminator(); } - CallSite CS{Prev}; - if (!CS) + CallBase *CB = dyn_cast<CallBase>(Prev); + if (!CB) return false; - auto *CallInstr = CS.getInstruction(); - - auto *Callee = CS.getCalledValue()->stripPointerCasts(); + auto *Callee = CB->getCalledOperand()->stripPointerCasts(); // See if the callsite is for resumption or destruction of the coroutine. auto *SubFn = dyn_cast<CoroSubFnInst>(Callee); @@ -1114,7 +1183,7 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, // calls in between Save and CallInstr. They can potenitally resume the // coroutine rendering this optimization unsafe. auto *Save = Suspend->getCoroSave(); - if (hasCallsBetween(Save, CallInstr)) + if (hasCallsBetween(Save, CB)) return false; // Replace llvm.coro.suspend with the value that results in resumption over @@ -1124,13 +1193,13 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, Save->eraseFromParent(); // No longer need a call to coro.resume or coro.destroy. - if (auto *Invoke = dyn_cast<InvokeInst>(CallInstr)) { + if (auto *Invoke = dyn_cast<InvokeInst>(CB)) { BranchInst::Create(Invoke->getNormalDest(), Invoke); } - // Grab the CalledValue from CS before erasing the CallInstr. - auto *CalledValue = CS.getCalledValue(); - CallInstr->eraseFromParent(); + // Grab the CalledValue from CB before erasing the CallInstr. + auto *CalledValue = CB->getCalledOperand(); + CB->eraseFromParent(); // If no more users remove it. Usually it is a bitcast of SubFn. if (CalledValue != SubFn && CalledValue->user_empty()) @@ -1155,7 +1224,10 @@ static void simplifySuspendPoints(coro::Shape &Shape) { if (N == 0) return; while (true) { - if (simplifySuspendPoint(cast<CoroSuspendInst>(S[I]), Shape.CoroBegin)) { + auto SI = cast<CoroSuspendInst>(S[I]); + // Leave final.suspend to handleFinalSuspend since it is undefined behavior + // to resume a coroutine suspended at the final suspend point. + if (!SI->isFinal() && simplifySuspendPoint(SI, Shape.CoroBegin)) { if (--N == I) break; std::swap(S[I], S[N]); @@ -1225,6 +1297,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, // Allocate. We don't need to update the call graph node because we're // going to recompute it from scratch after splitting. 
+ // FIXME: pass the required alignment RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr); RawFramePtr = Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); @@ -1342,19 +1415,8 @@ namespace { }; } -static void splitCoroutine(Function &F, coro::Shape &Shape, - SmallVectorImpl<Function *> &Clones) { - switch (Shape.ABI) { - case coro::ABI::Switch: - return splitSwitchCoroutine(F, Shape, Clones); - case coro::ABI::Retcon: - case coro::ABI::RetconOnce: - return splitRetconCoroutine(F, Shape, Clones); - } - llvm_unreachable("bad ABI kind"); -} - -static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { +static coro::Shape splitCoroutine(Function &F, + SmallVectorImpl<Function *> &Clones) { PrettyStackTraceFunction prettyStackTrace(F); // The suspend-crossing algorithm in buildCoroutineFrame get tripped @@ -1363,26 +1425,42 @@ static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { coro::Shape Shape(F); if (!Shape.CoroBegin) - return; + return Shape; simplifySuspendPoints(Shape); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); - SmallVector<Function*, 4> Clones; - // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. if (Shape.CoroSuspends.empty()) { handleNoSuspendCoroutine(Shape); } else { - splitCoroutine(F, Shape, Clones); + switch (Shape.ABI) { + case coro::ABI::Switch: + splitSwitchCoroutine(F, Shape, Clones); + break; + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + splitRetconCoroutine(F, Shape, Clones); + break; + } } // Replace all the swifterror operations in the original function. // This invalidates SwiftErrorOps in the Shape. replaceSwiftErrorOps(F, Shape, nullptr); + return Shape; +} + +static void +updateCallGraphAfterCoroutineSplit(Function &F, const coro::Shape &Shape, + const SmallVectorImpl<Function *> &Clones, + CallGraph &CG, CallGraphSCC &SCC) { + if (!Shape.CoroBegin) + return; + removeCoroEnds(Shape, &CG); postSplitCleanup(F); @@ -1390,6 +1468,44 @@ static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { coro::updateCallGraph(F, Clones, CG, SCC); } +static void updateCallGraphAfterCoroutineSplit( + LazyCallGraph::Node &N, const coro::Shape &Shape, + const SmallVectorImpl<Function *> &Clones, LazyCallGraph::SCC &C, + LazyCallGraph &CG, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR, + FunctionAnalysisManager &FAM) { + if (!Shape.CoroBegin) + return; + + for (llvm::CoroEndInst *End : Shape.CoroEnds) { + auto &Context = End->getContext(); + End->replaceAllUsesWith(ConstantInt::getFalse(Context)); + End->eraseFromParent(); + } + + postSplitCleanup(N.getFunction()); + + // To insert the newly created coroutine funclets 'f.resume', 'f.destroy', and + // 'f.cleanup' into the same SCC as the coroutine 'f' they were outlined from, + // we make use of the CallGraphUpdater class, which can modify the internal + // state of the LazyCallGraph. + for (Function *Clone : Clones) + CG.addNewFunctionIntoRefSCC(*Clone, C.getOuterRefSCC()); + + // We've inserted instructions into coroutine 'f' that reference the three new + // coroutine funclets. We must now update the call graph so that reference + // edges between 'f' and its funclets are added to it. LazyCallGraph only + // allows CGSCC passes to insert "trivial" reference edges. We've ensured + // above, by inserting the funclets into the same SCC as the corutine, that + // the edges are trivial. 
+ // + // N.B.: If we didn't update the call graph here, a CGSCCToFunctionPassAdaptor + // later in this CGSCC pass pipeline may be run, triggering a call graph + // update of its own. Function passes run by the adaptor are not permitted to + // add new edges of any kind to the graph, and the new edges inserted by this + // pass would be misattributed to that unrelated function pass. + updateCGAndAnalysisManagerForCGSCCPass(CG, C, N, AM, UR, FAM); +} + // When we see the coroutine the first time, we insert an indirect call to a // devirt trigger function and mark the coroutine that it is now ready for // split. @@ -1521,12 +1637,89 @@ static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) { return Changed; } -//===----------------------------------------------------------------------===// -// Top Level Driver -//===----------------------------------------------------------------------===// +static bool declaresCoroSplitIntrinsics(const Module &M) { + return coro::declaresIntrinsics( + M, {"llvm.coro.begin", "llvm.coro.prepare.retcon"}); +} + +PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C, + CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR) { + // NB: One invariant of a valid LazyCallGraph::SCC is that it must contain a + // non-zero number of nodes, so we assume that here and grab the first + // node's function's module. + Module &M = *C.begin()->getFunction().getParent(); + auto &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + + if (!declaresCoroSplitIntrinsics(M)) + return PreservedAnalyses::all(); + + // Check for uses of llvm.coro.prepare.retcon. + const auto *PrepareFn = M.getFunction("llvm.coro.prepare.retcon"); + if (PrepareFn && PrepareFn->use_empty()) + PrepareFn = nullptr; + + // Find coroutines for processing. + SmallVector<LazyCallGraph::Node *, 4> Coroutines; + for (LazyCallGraph::Node &N : C) + if (N.getFunction().hasFnAttribute(CORO_PRESPLIT_ATTR)) + Coroutines.push_back(&N); + + if (Coroutines.empty() && !PrepareFn) + return PreservedAnalyses::all(); + + if (Coroutines.empty()) + llvm_unreachable("new pass manager cannot yet handle " + "'llvm.coro.prepare.retcon'"); + + // Split all the coroutines. + for (LazyCallGraph::Node *N : Coroutines) { + Function &F = N->getFunction(); + Attribute Attr = F.getFnAttribute(CORO_PRESPLIT_ATTR); + StringRef Value = Attr.getValueAsString(); + LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F.getName() + << "' state: " << Value << "\n"); + if (Value == UNPREPARED_FOR_SPLIT) { + // Enqueue a second iteration of the CGSCC pipeline. + // N.B.: + // The CoroSplitLegacy pass "triggers" a restart of the CGSCC pass + // pipeline by inserting an indirect function call that the + // CoroElideLegacy pass then replaces with a direct function call. The + // legacy CGSCC pipeline's implicit behavior was as if wrapped in the new + // pass manager abstraction DevirtSCCRepeatedPass. + // + // This pass does not need to "trigger" another run of the pipeline. + // Instead, it simply enqueues the same RefSCC onto the pipeline's + // worklist. 
+ UR.CWorklist.insert(&C); + F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); + continue; + } + F.removeFnAttr(CORO_PRESPLIT_ATTR); + + SmallVector<Function *, 4> Clones; + const coro::Shape Shape = splitCoroutine(F, Clones); + updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM); + } + + if (PrepareFn) + llvm_unreachable("new pass manager cannot yet handle " + "'llvm.coro.prepare.retcon'"); + + return PreservedAnalyses::none(); +} namespace { +// We present a coroutine to LLVM as an ordinary function with suspension +// points marked up with intrinsics. We let the optimizer party on the coroutine +// as a single function for as long as possible. Shortly before the coroutine is +// eligible to be inlined into its callers, we split up the coroutine into parts +// corresponding to initial, resume and destroy invocations of the coroutine, +// add them to the current SCC and restart the IPO pipeline to optimize the +// coroutine subfunctions we extracted before proceeding to the caller of the +// coroutine. struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid @@ -1539,9 +1732,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass { // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. bool doInitialization(CallGraph &CG) override { - Run = coro::declaresIntrinsics(CG.getModule(), - {"llvm.coro.begin", - "llvm.coro.prepare.retcon"}); + Run = declaresCoroSplitIntrinsics(CG.getModule()); return CallGraphSCCPass::doInitialization(CG); } @@ -1583,7 +1774,10 @@ struct CoroSplitLegacy : public CallGraphSCCPass { continue; } F->removeFnAttr(CORO_PRESPLIT_ATTR); - splitCoroutine(*F, CG, SCC); + + SmallVector<Function *, 4> Clones; + const coro::Shape Shape = splitCoroutine(*F, Clones); + updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC); } if (PrepareFn) diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 02d11af3303f..87c3a8b0d0cf 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -649,3 +648,9 @@ void LLVMAddCoroElidePass(LLVMPassManagerRef PM) { void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCoroCleanupLegacyPass()); } + +void +LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(LLVMPassManagerBuilderRef PMB) { + PassManagerBuilder *Builder = unwrap(PMB); + addCoroutinePassesToExtensionPoints(*Builder); +} |
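For reference, the resume entry block that createResumeEntryBlock builds dispatches on the index slot of the coroutine frame. A rough C++ analogue of that control flow (a sketch only; the Frame layout is abridged and all names here are invented for illustration, they do not come from this patch):

struct Frame {
  void (*resume)(Frame *);   // SwitchFieldIndex::Resume
  void (*destroy)(Frame *);  // SwitchFieldIndex::Destroy
  // ... promise and spilled values elided ...
  int index;                 // the field whose index Shape.getSwitchIndexField() returns
};

void f_resume(Frame *frame) {
  switch (frame->index) {            // the switch emitted into the resume funclet
  case 0: /* continue after suspend point 0 */ break;
  case 1: /* continue after suspend point 1 */ break;
  default: __builtin_unreachable();  // UnreachBB: resuming anywhere else is UB
  }
}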
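The musttail marking added by shouldBeMustTail and addMustTailToCoroResumes exists to support C++ symmetric transfer, where await_suspend hands control directly to another coroutine. A minimal source-level sketch of the pattern it targets (illustrative only; the Awaiter type and its next member are invented, and the C++20 header is used rather than the Coroutines-TS spelling):

#include <coroutine>

struct Awaiter {
  std::coroutine_handle<> next;  // coroutine to transfer control to
  bool await_ready() const noexcept { return false; }
  // Returning a handle resumes `next` as the suspending coroutine's final
  // action; after CoroSplit that shows up as a call to the resume funclet
  // immediately followed by ret, which is exactly the shape that now gets
  // the musttail attribute, even at -O0.
  std::coroutine_handle<> await_suspend(std::coroutine_handle<>) noexcept {
    return next;
  }
  void await_resume() const noexcept {}
};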
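The new-PM entry point (CoroSplitPass::run on a LazyCallGraph::SCC) can be driven from a hand-built pipeline. The following is a sketch under the assumption of a default-constructed PassBuilder; only CoroSplitPass and CoroCleanupPass themselves come from this patch, the driver function is hypothetical:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Coroutines/CoroCleanup.h"
#include "llvm/Transforms/Coroutines/CoroSplit.h"
using namespace llvm;

// Hypothetical driver: split coroutines on the post-order SCC walk, then
// lower whatever intrinsics remain with the per-function cleanup pass.
void splitAndCleanupCoroutines(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(CoroSplitPass()));
  MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
  MPM.run(M, MAM);
}

Note that crossRegisterProxies is what lets the FunctionAnalysisManagerCGSCCProxy query inside CoroSplitPass::run succeed.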
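The C binding added at the end of Coroutines.cpp plugs the coroutine passes into a legacy PassManagerBuilder pipeline. A usage sketch from the C API (assuming the declaration is exposed next to the existing LLVMAddCoro*Pass bindings in llvm-c/Transforms/Coroutines.h; everything else here is pre-existing C API):

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Coroutines.h"
#include "llvm-c/Transforms/PassManagerBuilder.h"

static void optimizeModuleWithCoroutines(LLVMModuleRef M) {
  LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
  LLVMPassManagerBuilderSetOptLevel(PMB, 2);
  // New in this patch: register the coroutine passes at the builder's
  // extension points so the populated pipeline lowers coroutines.
  LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(PMB);

  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMPassManagerBuilderPopulateModulePassManager(PMB, PM);
  LLVMRunPassManager(PM, M);

  LLVMDisposePassManager(PM);
  LLVMPassManagerBuilderDispose(PMB);
}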