1 files changed, 331 insertions, 243 deletions
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index a5a4f91299f3..c0d358ead278 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -17,7 +17,6 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -42,6 +41,7 @@
 STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
 STATISTIC(NumFrameExtraProbe,
           "Number of extra stack probes generated in prologue");
+STATISTIC(NumFunctionUsingPush2Pop2, "Number of funtions using push2/pop2");
 
 using namespace llvm;
 
@@ -69,8 +69,8 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
 /// call frame pseudos can be simplified.  Having a FP, as in the default
 /// implementation, is not sufficient here since we can't always use it.
 /// Use a more nuanced condition.
-bool
-X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+bool X86FrameLowering::canSimplifyCallFramePseudos(
+    const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) ||
          MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
          (hasFP(MF) && !TRI->hasStackRealignment(MF)) ||
@@ -84,8 +84,8 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
 // that were not simplified earlier.
 // So, this is required for x86 functions that have push sequences even
 // when there are no stack objects.
-bool
-X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+bool X86FrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
   return MF.getFrameInfo().hasStackObjects() ||
          MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 }
@@ -140,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
   return X86::MOV32ri;
 }
 
+// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
+// value written by the PUSH from the stack. The processor tracks these marked
+// instructions internally and fast-forwards register data between matching PUSH
+// and POP instructions, without going through memory or through the training
+// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
+// memory-renaming optimization can be used.
+//
+// The PPX hint is purely a performance hint. Instructions with this hint have
+// the same functional semantics as those without. PPX hints set by the
+// compiler that violate the balancing rule may turn off the PPX optimization,
+// but they will not affect program semantics.
+//
+// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
+// are not considered).
+//
+// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
+// GPRs at a time to/from the stack.
+static unsigned getPUSHOpcode(const X86Subtarget &ST) {
+  return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
+                      : X86::PUSH32r;
+}
+static unsigned getPOPOpcode(const X86Subtarget &ST) {
+  return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
+                      : X86::POP32r;
+}
+static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
+  return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
+}
+static unsigned getPOP2Opcode(const X86Subtarget &ST) {
+  return ST.hasPPX() ? X86::POP2P : X86::POP2;
+}
+
 static bool isEAXLiveIn(MachineBasicBlock &MBB) {
   for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
     unsigned Reg = RegMask.PhysReg;
@@ -195,8 +227,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
 /// stack pointer by a constant value.
 void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator &MBBI,
-                                    const DebugLoc &DL,
-                                    int64_t NumBytes, bool InEpilogue) const {
+                                    const DebugLoc &DL, int64_t NumBytes,
+                                    bool InEpilogue) const {
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
   MachineInstr::MIFlag Flag =
@@ -280,13 +312,11 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
     if (ThisVal == SlotSize) {
       // Use push / pop for slot sized adjustments as a size optimization. We
       // need to find a dead register when using pop.
-      unsigned Reg = isSub
-        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
-        : TRI->findDeadCallerSavedReg(MBB, MBBI);
+      unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+                           : TRI->findDeadCallerSavedReg(MBB, MBBI);
       if (Reg) {
-        unsigned Opc = isSub
-          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
-          : (Is64Bit ? X86::POP64r  : X86::POP32r);
+        unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+                             : (Is64Bit ? X86::POP64r : X86::POP32r);
         BuildMI(MBB, MBBI, DL, TII.get(Opc))
             .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
             .setMIFlag(Flag);
@@ -562,49 +592,13 @@ void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
       RegsToZero.reset(Reg);
     }
 
+  // Zero out the GPRs first.
   for (MCRegister Reg : GPRsToZero.set_bits())
-    BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg)
-        .addReg(Reg, RegState::Undef)
-        .addReg(Reg, RegState::Undef);
-
-  // Zero out registers.
-  for (MCRegister Reg : RegsToZero.set_bits()) {
-    if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
-      // FIXME: Ignore MMX registers?
-      continue;
+    TII.buildClearRegister(Reg, MBB, MBBI, DL);
 
-    unsigned XorOp;
-    if (X86::VR128RegClass.contains(Reg)) {
-      // XMM#
-      if (!ST.hasSSE1())
-        continue;
-      XorOp = X86::PXORrr;
-    } else if (X86::VR256RegClass.contains(Reg)) {
-      // YMM#
-      if (!ST.hasAVX())
-        continue;
-      XorOp = X86::VPXORrr;
-    } else if (X86::VR512RegClass.contains(Reg)) {
-      // ZMM#
-      if (!ST.hasAVX512())
-        continue;
-      XorOp = X86::VPXORYrr;
-    } else if (X86::VK1RegClass.contains(Reg) ||
-               X86::VK2RegClass.contains(Reg) ||
-               X86::VK4RegClass.contains(Reg) ||
-               X86::VK8RegClass.contains(Reg) ||
-               X86::VK16RegClass.contains(Reg)) {
-      if (!ST.hasVLX())
-        continue;
-      XorOp = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
-    } else {
-      continue;
-    }
-
-    BuildMI(MBB, MBBI, DL, TII.get(XorOp), Reg)
-      .addReg(Reg, RegState::Undef)
-      .addReg(Reg, RegState::Undef);
-  }
+  // Zero out the remaining registers.
+  for (MCRegister Reg : RegsToZero.set_bits())
+    TII.buildClearRegister(Reg, MBB, MBBI, DL);
 }
 
 void X86FrameLowering::emitStackProbe(
@@ -959,24 +953,16 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
   // registers. For the prolog expansion we use RAX, RCX and RDX.
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetRegisterClass *RegClass = &X86::GR64RegClass;
-  const Register SizeReg = InProlog ? X86::RAX
-                                    : MRI.createVirtualRegister(RegClass),
-                 ZeroReg = InProlog ? X86::RCX
-                                    : MRI.createVirtualRegister(RegClass),
-                 CopyReg = InProlog ? X86::RDX
-                                    : MRI.createVirtualRegister(RegClass),
-                 TestReg = InProlog ? X86::RDX
-                                    : MRI.createVirtualRegister(RegClass),
-                 FinalReg = InProlog ? X86::RDX
-                                     : MRI.createVirtualRegister(RegClass),
-                 RoundedReg = InProlog ? X86::RDX
-                                       : MRI.createVirtualRegister(RegClass),
-                 LimitReg = InProlog ? X86::RCX
-                                     : MRI.createVirtualRegister(RegClass),
-                 JoinReg = InProlog ? X86::RCX
-                                    : MRI.createVirtualRegister(RegClass),
-                 ProbeReg = InProlog ? X86::RCX
-                                     : MRI.createVirtualRegister(RegClass);
+  const Register
+      SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
+      ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);
 
   // SP-relative offsets where we can save RCX and RDX.
   int64_t RCXShadowSlot = 0;
@@ -1048,7 +1034,9 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
       .addReg(X86::GS);
   BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
   // Jump if the desired stack pointer is at or above the stack limit.
-  BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE);
+  BuildMI(&MBB, DL, TII.get(X86::JCC_1))
+      .addMBB(ContinueMBB)
+      .addImm(X86::COND_AE);
 
   // Add code to roundMBB to round the final stack pointer to a page boundary.
   RoundMBB->addLiveIn(FinalReg);
@@ -1085,7 +1073,9 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
   BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
       .addReg(RoundedReg)
       .addReg(ProbeReg);
-  BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE);
+  BuildMI(LoopMBB, DL, TII.get(X86::JCC_1))
+      .addMBB(LoopMBB)
+      .addImm(X86::COND_NE);
 
   MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
 
@@ -1169,7 +1159,7 @@ void X86FrameLowering::emitStackProbeCall(
     CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
   } else {
     CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
-        .addExternalSymbol(MF.createExternalSymbolName(Symbol));
+             .addExternalSymbol(MF.createExternalSymbolName(Symbol));
   }
 
   unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX;
@@ -1231,7 +1221,8 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) {
 // info, we need to know the ABI stack alignment as well in case we
 // have a call out.  Otherwise just make sure we have some alignment - we'll
 // go with the minimum SlotSize.
-uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+uint64_t
+X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
   Align StackAlign = getStackAlign();
@@ -1322,8 +1313,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
       // Loop entry block
 
       {
-        const unsigned SUBOpc =
-            getSUBriOpcode(Uses64BitFramePtr);
+        const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
         BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
             .addReg(StackPtr)
             .addImm(StackProbeSize)
@@ -1353,8 +1343,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
             .addImm(0)
             .setMIFlag(MachineInstr::FrameSetup);
 
-        const unsigned SUBOpc =
-            getSUBriOpcode(Uses64BitFramePtr);
+        const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr);
         BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
             .addReg(StackPtr)
             .addImm(StackProbeSize)
@@ -1405,7 +1394,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
   }
 }
 
-bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
+bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {
   // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
   // clobbered by any interrupt handler.
   assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
@@ -1521,7 +1510,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
-  uint64_t StackSize = MFI.getStackSize();    // Number of bytes to allocate.
+  uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
   bool IsFunclet = MBB.isEHFuncletEntry();
   EHPersonality Personality = EHPersonality::Unknown;
   if (Fn.hasPersonalityFn())
@@ -1539,8 +1528,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   bool NeedsDwarfCFI = needsDwarfCFI(MF);
   Register FramePtr = TRI->getFrameRegister(MF);
   const Register MachineFramePtr =
-      STI.isTarget64BitILP32()
-          ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
+      STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
+                               : FramePtr;
   Register BasePtr = TRI->getBaseRegister();
   bool HasWinCFI = false;
 
@@ -1575,7 +1564,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     }
     BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
     int64_t Offset = -(int64_t)SlotSize;
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm: X86::PUSH32rmm))
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm))
         .addReg(ArgBaseReg)
         .addImm(1)
         .addReg(X86::NoRegister)
@@ -1587,7 +1576,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // Space reserved for stack-based arguments when making a (ABI-guaranteed)
   // tail call.
   unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta();
-  if (TailCallArgReserveSize  && IsWin64Prologue)
+  if (TailCallArgReserveSize && IsWin64Prologue)
     report_fatal_error("Can't handle guaranteed tail call under win64 yet");
 
   const bool EmitStackProbeCall =
@@ -1659,7 +1648,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       !MF.shouldSplitStack()) {                // Regular stack
     uint64_t MinSize =
         X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta();
-    if (HasFP) MinSize += SlotSize;
+    if (HasFP)
+      MinSize += SlotSize;
     X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
     MFI.setStackSize(StackSize);
@@ -1714,17 +1704,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
 
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    NumBytes = FrameSize -
-               (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
+    NumBytes =
+        FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
 
     // Callee-saved registers are pushed on stack before the stack is realigned.
     if (TRI->hasStackRealignment(MF) && !IsWin64Prologue)
       NumBytes = alignTo(NumBytes, MaxAlign);
 
     // Save EBP/RBP into the appropriate stack slot.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
-      .addReg(MachineFramePtr, RegState::Kill)
-      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL,
+            TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
+        .addReg(MachineFramePtr, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
 
     if (NeedsDwarfCFI && !ArgBaseReg.isValid()) {
       // Mark the place where EBP/RBP was saved.
@@ -1839,8 +1830,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     }
   } else {
     assert(!IsFunclet && "funclets without FPs not yet implemented");
-    NumBytes = StackSize -
-               (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
+    NumBytes =
+        StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize);
   }
 
   // Update the offset adjustment, which is mainly used by codeview to translate
@@ -1861,19 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // Skip the callee-saved push instructions.
   bool PushedRegs = false;
   int StackOffset = 2 * stackGrowth;
+  MachineBasicBlock::const_iterator LastCSPush = MBBI;
+  auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
+    if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
+      return false;
+    unsigned Opc = MBBI->getOpcode();
+    return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
+           Opc == X86::PUSH2 || Opc == X86::PUSH2P;
+  };
 
-  while (MBBI != MBB.end() &&
-         MBBI->getFlag(MachineInstr::FrameSetup) &&
-         (MBBI->getOpcode() == X86::PUSH32r ||
-          MBBI->getOpcode() == X86::PUSH64r)) {
+  while (IsCSPush(MBBI)) {
     PushedRegs = true;
     Register Reg = MBBI->getOperand(0).getReg();
+    LastCSPush = MBBI;
     ++MBBI;
+    unsigned Opc = LastCSPush->getOpcode();
 
     if (!HasFP && NeedsDwarfCFI) {
       // Mark callee-saved push instruction.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
+      // Compared to push, push2 introduces more stack offset (one more
+      // register).
+      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+        StackOffset += stackGrowth;
       BuildCFI(MBB, MBBI, DL,
                MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
                MachineInstr::FrameSetup);
@@ -1885,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
           .addImm(Reg)
           .setMIFlag(MachineInstr::FrameSetup);
+      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+            .addImm(LastCSPush->getOperand(1).getReg())
+            .setMIFlag(MachineInstr::FrameSetup);
     }
   }
 
@@ -1933,13 +1939,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       if (Is64Bit) {
         // Save RAX
         BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
-          .addReg(X86::RAX, RegState::Kill)
-          .setMIFlag(MachineInstr::FrameSetup);
+            .addReg(X86::RAX, RegState::Kill)
+            .setMIFlag(MachineInstr::FrameSetup);
       } else {
         // Save EAX
         BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
-          .addReg(X86::EAX, RegState::Kill)
-          .setMIFlag(MachineInstr::FrameSetup);
+            .addReg(X86::EAX, RegState::Kill)
+            .setMIFlag(MachineInstr::FrameSetup);
       }
     }
 
@@ -2122,16 +2128,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     // Update the base pointer with the current stack pointer.
     unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
     BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
-      .addReg(SPOrEstablisher)
-      .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(SPOrEstablisher)
+        .setMIFlag(MachineInstr::FrameSetup);
     if (X86FI->getRestoreBasePointer()) {
       // Stash value of base pointer.  Saving RSP instead of EBP shortens
       // dependence chain. Used by SjLj EH.
       unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
-                   FramePtr, true, X86FI->getRestoreBasePointerOffset())
-        .addReg(SPOrEstablisher)
-        .setMIFlag(MachineInstr::FrameSetup);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true,
+                   X86FI->getRestoreBasePointerOffset())
+          .addReg(SPOrEstablisher)
+          .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
@@ -2244,9 +2250,9 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
   // This is the size of the pushed CSRs.
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   // This is the size of callee saved XMMs.
-  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
-  unsigned XMMSize = WinEHXMMSlotInfo.size() *
-                     TRI->getSpillSize(X86::VR128RegClass);
+  const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  unsigned XMMSize =
+      WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass);
   // This is the amount of stack a funclet needs to allocate.
   unsigned UsedSize;
   EHPersonality Personality =
@@ -2270,10 +2276,9 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
 }
 
 static bool isTailCallOpcode(unsigned Opc) {
-    return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
-        Opc == X86::TCRETURNmi ||
-        Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 ||
-        Opc == X86::TCRETURNmi64;
+  return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+         Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
+         Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64;
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -2359,18 +2364,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     if (X86FI->hasSwiftAsyncContext()) {
       // Discard the context.
       int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
-      emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/true);
+      emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
     }
     // Pop EBP.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+    BuildMI(MBB, MBBI, DL,
+            TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
             MachineFramePtr)
         .setMIFlag(MachineInstr::FrameDestroy);
 
     // We need to reset FP to its untagged state on return. Bit 60 is currently
     // used to show the presence of an extended frame.
     if (X86FI->hasSwiftAsyncContext()) {
-      BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8),
-              MachineFramePtr)
+      BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr)
           .addUse(MachineFramePtr)
           .addImm(60)
           .setMIFlag(MachineInstr::FrameDestroy);
@@ -2403,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     unsigned Opc = PI->getOpcode();
 
     if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
-      if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy)))
+      if (!PI->getFlag(MachineInstr::FrameDestroy) ||
+          (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
+           Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
+           Opc != X86::POP2P && Opc != X86::LEA64r))
         break;
       FirstCSPop = PI;
     }
@@ -2458,13 +2463,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     // effects of the prologue can safely be undone.
     if (LEAAmount != 0) {
       unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
-      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
-                   FramePtr, false, LEAAmount);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr,
+                   false, LEAAmount);
       --MBBI;
     } else {
       unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
-      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
-        .addReg(FramePtr);
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr);
       --MBBI;
     }
   } else if (NumBytes) {
@@ -2498,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
       MachineBasicBlock::iterator PI = MBBI;
       unsigned Opc = PI->getOpcode();
       ++MBBI;
-      if (Opc == X86::POP32r || Opc == X86::POP64r) {
+      if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
+          Opc == X86::POP2 || Opc == X86::POP2P) {
         Offset += SlotSize;
+        // Compared to pop, pop2 introduces more stack offset (one more
+        // register).
+        if (Opc == X86::POP2 || Opc == X86::POP2P)
+          Offset += SlotSize;
         BuildCFI(MBB, MBBI, DL,
                  MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
                  MachineInstr::FrameDestroy);
@@ -2570,7 +2579,8 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
 
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    // If required, include space for extra hidden slot for stashing base pointer.
+    // If required, include space for extra hidden slot for stashing base
+    // pointer.
     if (X86FI->getRestoreBasePointer())
       FrameSize += SlotSize;
     uint64_t NumBytes = FrameSize - CSSize;
@@ -2615,7 +2625,7 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
                                               Register &FrameReg) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
   const auto it = WinEHXMMSlotInfo.find(FI);
 
   if (it == WinEHXMMSlotInfo.end())
@@ -2743,7 +2753,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     //   }
     //   [EBP]
     MFI.CreateFixedObject(-TailCallReturnAddrDelta,
-                           TailCallReturnAddrDelta - SlotSize, true);
+                          TailCallReturnAddrDelta - SlotSize, true);
   }
 
   // Spill the BasePtr if it's used.
@@ -2774,13 +2784,37 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     // about avoiding it later.
     Register FPReg = TRI->getFrameRegister(MF);
     for (unsigned i = 0; i < CSI.size(); ++i) {
-      if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
+      if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
         CSI.erase(CSI.begin() + i);
         break;
       }
     }
   }
 
+  // Strategy:
+  // 1. Use push2 when
+  //       a) number of CSR > 1 if no need padding
+  //       b) number of CSR > 2 if need padding
+  // 2. When the number of CSR push is odd
+  //    a. Start to use push2 from the 1st push if stack is 16B aligned.
+  //    b. Start to use push2 from the 2nd push if stack is not 16B aligned.
+  // 3. When the number of CSR push is even, start to use push2 from the 1st
+  //    push and make the stack 16B aligned before the push
+  unsigned NumRegsForPush2 = 0;
+  if (STI.hasPush2Pop2()) {
+    unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
+      return X86::GR64RegClass.contains(I.getReg());
+    });
+    bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
+    bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
+    X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
+    NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
+    if (X86FI->padForPush2Pop2()) {
+      SpillSlotOffset -= SlotSize;
+      MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+    }
+  }
+
   // Assign slots for GPRs. It increases frame size.
   for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
     Register Reg = I.getReg();
@@ -2788,6 +2822,13 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
       continue;
 
+    // A CSR is a candidate for push2/pop2 when it's slot offset is 16B aligned
+    // or only an odd number of registers in the candidates.
+    if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
+        (SpillSlotOffset % 16 == 0 ||
+         X86FI->getNumCandidatesForPush2Pop2() % 2))
+      X86FI->addCandidateForPush2Pop2(Reg);
+
     SpillSlotOffset -= SlotSize;
     CalleeSavedFrameSize += SlotSize;
 
@@ -2805,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     // TODO: saving the slot index is better?
     X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
   }
+  assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
+         "Expect even candidates for push2/pop2");
+  if (X86FI->getNumCandidatesForPush2Pop2())
+    ++NumFunctionUsingPush2Pop2;
   X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
   MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
 
@@ -2854,40 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
 
   // Push GPRs. It increases frame size.
   const MachineFunction &MF = *MBB.getParent();
-  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
-  for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
-    Register Reg = I.getReg();
-
-    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
-      continue;
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  if (X86FI->padForPush2Pop2())
+    emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);
 
+  // Update LiveIn of the basic block and decide whether we can add a kill flag
+  // to the use.
+  auto UpdateLiveInCheckCanKill = [&](Register Reg) {
     const MachineRegisterInfo &MRI = MF.getRegInfo();
-    bool isLiveIn = MRI.isLiveIn(Reg);
-    if (!isLiveIn)
-      MBB.addLiveIn(Reg);
-
-    // Decide whether we can add a kill flag to the use.
-    bool CanKill = !isLiveIn;
-    // Check if any subregister is live-in
-    if (CanKill) {
-      for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
-        if (MRI.isLiveIn(*AReg)) {
-          CanKill = false;
-          break;
-        }
-      }
-    }
-
     // Do not set a kill flag on values that are also marked as live-in. This
     // happens with the @llvm-returnaddress intrinsic and with arguments
     // passed in callee saved registers.
     // Omitting the kill flags is conservatively correct even if the live-in
     // is not used after all.
-    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
-      .setMIFlag(MachineInstr::FrameSetup);
+    if (MRI.isLiveIn(Reg))
+      return false;
+    MBB.addLiveIn(Reg);
+    // Check if any subregister is live-in
+    for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
+      if (MRI.isLiveIn(*AReg))
+        return false;
+    return true;
+  };
+  auto UpdateLiveInGetKillRegState = [&](Register Reg) {
+    return getKillRegState(UpdateLiveInCheckCanKill(Reg));
+  };
+
+  for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
+    Register Reg = RI->getReg();
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    if (X86FI->isCandidateForPush2Pop2(Reg)) {
+      Register Reg2 = (++RI)->getReg();
+      BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
+          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+          .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
+          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 
-  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   if (X86FI->getRestoreBasePointer()) {
     unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
     Register BaseReg = this->TRI->getBaseRegister();
@@ -2979,8 +3034,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
   // Reload XMMs from stack frame.
   for (const CalleeSavedInfo &I : CSI) {
     Register Reg = I.getReg();
-    if (X86::GR64RegClass.contains(Reg) ||
-        X86::GR32RegClass.contains(Reg))
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
       continue;
 
     // If this is k-register make sure we lookup via the largest legal type.
@@ -3004,16 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
   }
 
   // POP GPRs.
-  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
-  for (const CalleeSavedInfo &I : CSI) {
-    Register Reg = I.getReg();
-    if (!X86::GR64RegClass.contains(Reg) &&
-        !X86::GR32RegClass.contains(Reg))
+  for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    Register Reg = I->getReg();
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
       continue;
 
-    BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
-        .setMIFlag(MachineInstr::FrameDestroy);
+    if (X86FI->isCandidateForPush2Pop2(Reg))
+      BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
+          .addReg((++I)->getReg(), RegState::Define)
+          .setMIFlag(MachineInstr::FrameDestroy);
+    else
+      BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
+          .setMIFlag(MachineInstr::FrameDestroy);
   }
+  if (X86FI->padForPush2Pop2())
+    emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);
+
   return true;
 }
 
@@ -3023,7 +3083,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
 
   // Spill the BasePtr if it's used.
-  if (TRI->hasBasePointer(MF)){
+  if (TRI->hasBasePointer(MF)) {
     Register BasePtr = TRI->getBaseRegister();
     if (STI.isTarget64BitILP32())
       BasePtr = getX86SubSuperRegister(BasePtr, 64);
@@ -3031,11 +3091,10 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
   }
 }
 
-static bool
-HasNestArgument(const MachineFunction *MF) {
+static bool HasNestArgument(const MachineFunction *MF) {
   const Function &F = MF->getFunction();
-  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
-       I != E; I++) {
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+       I++) {
     if (I->hasNestAttr() && !I->use_empty())
       return true;
   }
@@ -3046,8 +3105,8 @@ HasNestArgument(const MachineFunction *MF) {
 /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
 /// and the properties of the function either one or two registers will be
 /// needed. Set primary to true for the first register, false for the second.
-static unsigned
-GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
+static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64,
+                                   const MachineFunction &MF, bool Primary) {
   CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();
 
   // Erlang stuff.
@@ -3148,7 +3207,7 @@ void X86FrameLowering::adjustForSegmentedStacks(
       TlsOffset = IsLP64 ? 0x70 : 0x40;
     } else if (STI.isTargetDarwin()) {
       TlsReg = X86::GS;
-      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+      TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90.
     } else if (STI.isTargetWin64()) {
       TlsReg = X86::GS;
       TlsOffset = 0x28; // pvArbitrary, reserved for application use
@@ -3165,18 +3224,28 @@ void X86FrameLowering::adjustForSegmentedStacks(
     if (CompareStackPointer)
       ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
     else
-      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
-        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
+              ScratchReg)
+          .addReg(X86::RSP)
+          .addImm(1)
+          .addReg(0)
+          .addImm(-StackSize)
+          .addReg(0);
 
-    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
-      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm))
+        .addReg(ScratchReg)
+        .addReg(0)
+        .addImm(1)
+        .addReg(0)
+        .addImm(TlsOffset)
+        .addReg(TlsReg);
   } else {
     if (STI.isTargetLinux()) {
       TlsReg = X86::GS;
       TlsOffset = 0x30;
     } else if (STI.isTargetDarwin()) {
       TlsReg = X86::GS;
-      TlsOffset = 0x48 + 90*4;
+      TlsOffset = 0x48 + 90 * 4;
     } else if (STI.isTargetWin32()) {
       TlsReg = X86::FS;
       TlsOffset = 0x14; // pvArbitrary, reserved for application use
@@ -3192,13 +3261,22 @@ void X86FrameLowering::adjustForSegmentedStacks(
     if (CompareStackPointer)
       ScratchReg = X86::ESP;
     else
-      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
-        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg)
+          .addReg(X86::ESP)
+          .addImm(1)
+          .addReg(0)
+          .addImm(-StackSize)
+          .addReg(0);
 
     if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
         STI.isTargetDragonFly()) {
-      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
-        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+          .addReg(ScratchReg)
+          .addReg(0)
+          .addImm(0)
+          .addReg(0)
+          .addImm(TlsOffset)
+          .addReg(TlsReg);
     } else if (STI.isTargetDarwin()) {
 
       // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
@@ -3223,15 +3301,17 @@ void X86FrameLowering::adjustForSegmentedStacks(
 
       if (SaveScratch2)
         BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
-          .addReg(ScratchReg2, RegState::Kill);
+            .addReg(ScratchReg2, RegState::Kill);
 
       BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
-        .addImm(TlsOffset);
+          .addImm(TlsOffset);
       BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
-        .addReg(ScratchReg)
-        .addReg(ScratchReg2).addImm(1).addReg(0)
-        .addImm(0)
-        .addReg(TlsReg);
+          .addReg(ScratchReg)
+          .addReg(ScratchReg2)
+          .addImm(1)
+          .addReg(0)
+          .addImm(0)
+          .addReg(TlsReg);
 
       if (SaveScratch2)
         BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
@@ -3240,7 +3320,9 @@ void X86FrameLowering::adjustForSegmentedStacks(
 
   // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
   // It jumps to normal execution of the function body.
-  BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A);
+  BuildMI(checkMBB, DL, TII.get(X86::JCC_1))
+      .addMBB(&PrologueMBB)
+      .addImm(X86::COND_A);
 
   // On 32 bit we first push the arguments size and then the frame size. On 64
   // bit, we pass the stack frame size in r10 and the argument size in r11.
@@ -3264,9 +3346,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
         .addImm(X86FI->getArgumentStackSize());
   } else {
     BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
-      .addImm(X86FI->getArgumentStackSize());
-    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i))
-      .addImm(StackSize);
+        .addImm(X86FI->getArgumentStackSize());
+    BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize);
   }
 
   // __morestack is in libgcc
@@ -3298,10 +3379,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
   } else {
     if (Is64Bit)
       BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
-        .addExternalSymbol("__morestack");
+          .addExternalSymbol("__morestack");
     else
       BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
-        .addExternalSymbol("__morestack");
+          .addExternalSymbol("__morestack");
   }
 
   if (IsNested)
@@ -3323,22 +3404,24 @@ void X86FrameLowering::adjustForSegmentedStacks(
 /// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
 /// to fields it needs, through a named metadata node "hipe.literals" containing
 /// name-value pairs.
-static unsigned getHiPELiteral(
-    NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
+static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD,
+                               const StringRef LiteralName) {
   for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
     MDNode *Node = HiPELiteralsMD->getOperand(i);
-    if (Node->getNumOperands() != 2) continue;
+    if (Node->getNumOperands() != 2)
+      continue;
     MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
     ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
-    if (!NodeName || !NodeVal) continue;
+    if (!NodeName || !NodeVal)
+      continue;
     ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
     if (ValConst && NodeName->getString() == LiteralName) {
       return ValConst->getZExtValue();
     }
   }
 
-  report_fatal_error("HiPE literal " + LiteralName
-                     + " required but not provided");
+  report_fatal_error("HiPE literal " + LiteralName +
+                     " required but not provided");
 }
 
 // Return true if there are no non-ehpad successors to MBB and there are no
@@ -3378,19 +3461,19 @@ void X86FrameLowering::adjustForHiPEPrologue(
   assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
 
   // HiPE-specific values
-  NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
-    ->getNamedMetadata("hipe.literals");
+  NamedMDNode *HiPELiteralsMD =
+      MF.getMMI().getModule()->getNamedMetadata("hipe.literals");
   if (!HiPELiteralsMD)
     report_fatal_error(
         "Can't generate HiPE prologue without runtime parameters");
-  const unsigned HipeLeafWords
-    = getHiPELiteral(HiPELiteralsMD,
-                     Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
+  const unsigned HipeLeafWords = getHiPELiteral(
+      HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
   const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
   const unsigned Guaranteed = HipeLeafWords * SlotSize;
-  unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ?
-                            MF.getFunction().arg_size() - CCRegisteredArgs : 0;
-  unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
+  unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs
+                                ? MF.getFunction().arg_size() - CCRegisteredArgs
+                                : 0;
+  unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize;
 
   assert(STI.isTargetLinux() &&
          "HiPE prologue is only supported on Linux operating systems.");
@@ -3430,11 +3513,13 @@ void X86FrameLowering::adjustForHiPEPrologue(
             F->getName().find_first_of("._") == StringRef::npos)
           continue;
 
-        unsigned CalleeStkArity =
-          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+        unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs
+                                      ? F->arg_size() - CCRegisteredArgs
+                                      : 0;
         if (HipeLeafWords - 1 > CalleeStkArity)
-          MoreStackForCalls = std::max(MoreStackForCalls,
-                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+          MoreStackForCalls =
+              std::max(MoreStackForCalls,
+                       (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
       }
     }
     MaxStack += MoreStackForCalls;
@@ -3459,13 +3544,13 @@ void X86FrameLowering::adjustForHiPEPrologue(
     SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
     if (Is64Bit) {
       SPReg = X86::RSP;
-      PReg  = X86::RBP;
+      PReg = X86::RBP;
       LEAop = X86::LEA64r;
       CMPop = X86::CMP64rm;
       CALLop = X86::CALL64pcrel32;
     } else {
       SPReg = X86::ESP;
-      PReg  = X86::EBP;
+      PReg = X86::EBP;
       LEAop = X86::LEA32r;
       CMPop = X86::CMP32rm;
       CALLop = X86::CALLpcrel32;
@@ -3476,21 +3561,24 @@ void X86FrameLowering::adjustForHiPEPrologue(
            "HiPE prologue scratch register is live-in");
 
     // Create new MBB for StackCheck:
-    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
-                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
+                 false, -MaxStack);
     // SPLimitOffset is in a fixed heap location (pointed by BP).
-    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
-                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
-    BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE);
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
+                 PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1))
+        .addMBB(&PrologueMBB)
+        .addImm(X86::COND_AE);
 
     // Create new MBB for IncStack:
-    BuildMI(incStackMBB, DL, TII.get(CALLop)).
-      addExternalSymbol("inc_stack_0");
-    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
-                 SPReg, false, -MaxStack);
-    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
-                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
-    BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE);
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg,
+                 false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg),
+                 PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JCC_1))
+        .addMBB(incStackMBB)
+        .addImm(X86::COND_LE);
 
     stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
     stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
@@ -3570,15 +3658,15 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
     Regs[FoundRegs++] = Regs[0];
 
   for (int i = 0; i < NumPops; ++i)
-    BuildMI(MBB, MBBI, DL,
-            TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+    BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r),
+            Regs[i]);
 
   return true;
 }
 
-MachineBasicBlock::iterator X86FrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator I) const {
+MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
   bool reserveCallFrame = hasReservedCallFrame(MF);
   unsigned Opcode = I->getOpcode();
   bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
@@ -3666,9 +3754,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       // TODO: When not using precise CFA, we also need to adjust for the
       // InternalAmt here.
       if (CfaAdjustment) {
-        BuildCFI(MBB, InsertPos, DL,
-                 MCCFIInstruction::createAdjustCfaOffset(nullptr,
-                                                         CfaAdjustment));
+        BuildCFI(
+            MBB, InsertPos, DL,
+            MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
       }
     }
 
@@ -3837,11 +3925,11 @@ X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
 namespace {
 // Struct used by orderFrameObjects to help sort the stack objects.
 struct X86FrameSortingObject {
-  bool IsValid = false;         // true if we care about this Object.
-  unsigned ObjectIndex = 0;     // Index of Object into MFI list.
-  unsigned ObjectSize = 0;      // Size of Object in bytes.
+  bool IsValid = false;             // true if we care about this Object.
+  unsigned ObjectIndex = 0;         // Index of Object into MFI list.
+  unsigned ObjectSize = 0;          // Size of Object in bytes.
   Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
-  unsigned ObjectNumUses = 0;   // Object static number of uses.
+  unsigned ObjectNumUses = 0;       // Object static number of uses.
 };
 
 // The comparison function we use for std::sort to order our local
@@ -3881,9 +3969,9 @@ struct X86FrameSortingComparator {
     // the division and, with it, the need for any floating point
     // arithmetic.
     DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
-      static_cast<uint64_t>(B.ObjectSize);
+                     static_cast<uint64_t>(B.ObjectSize);
     DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
-      static_cast<uint64_t>(A.ObjectSize);
+                     static_cast<uint64_t>(A.ObjectSize);
 
     // If the two densities are equal, prioritize highest alignment
     // objects. This allows for similar alignment objects
@@ -3976,8 +4064,8 @@ void X86FrameLowering::orderFrameObjects(
     std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
 }
 
-
-unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+unsigned
+X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
   // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
   unsigned Offset = 16;
   // RBP is immediately pushed.