diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86FrameLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86FrameLowering.cpp | 574 |
1 files changed, 331 insertions, 243 deletions
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a5a4f91299f3..c0d358ead278 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -17,7 +17,6 @@ #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -42,6 +41,7 @@ STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); STATISTIC(NumFrameExtraProbe, "Number of extra stack probes generated in prologue"); +STATISTIC(NumFunctionUsingPush2Pop2, "Number of funtions using push2/pop2"); using namespace llvm; @@ -69,8 +69,8 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { /// call frame pseudos can be simplified. Having a FP, as in the default /// implementation, is not sufficient here since we can't always use it. /// Use a more nuanced condition. -bool -X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { +bool X86FrameLowering::canSimplifyCallFramePseudos( + const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || (hasFP(MF) && !TRI->hasStackRealignment(MF)) || @@ -84,8 +84,8 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { // that were not simplified earlier. // So, this is required for x86 functions that have push sequences even // when there are no stack objects. -bool -X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { +bool X86FrameLowering::needsFrameIndexResolution( + const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects() || MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } @@ -140,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) { return X86::MOV32ri; } +// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the +// value written by the PUSH from the stack. The processor tracks these marked +// instructions internally and fast-forwards register data between matching PUSH +// and POP instructions, without going through memory or through the training +// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient +// memory-renaming optimization can be used. +// +// The PPX hint is purely a performance hint. Instructions with this hint have +// the same functional semantics as those without. PPX hints set by the +// compiler that violate the balancing rule may turn off the PPX optimization, +// but they will not affect program semantics. +// +// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp +// are not considered). +// +// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2 +// GPRs at a time to/from the stack. +static unsigned getPUSHOpcode(const X86Subtarget &ST) { + return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r) + : X86::PUSH32r; +} +static unsigned getPOPOpcode(const X86Subtarget &ST) { + return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r) + : X86::POP32r; +} +static unsigned getPUSH2Opcode(const X86Subtarget &ST) { + return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2; +} +static unsigned getPOP2Opcode(const X86Subtarget &ST) { + return ST.hasPPX() ? X86::POP2P : X86::POP2; +} + static bool isEAXLiveIn(MachineBasicBlock &MBB) { for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) { unsigned Reg = RegMask.PhysReg; @@ -195,8 +227,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { /// stack pointer by a constant value. void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &DL, - int64_t NumBytes, bool InEpilogue) const { + const DebugLoc &DL, int64_t NumBytes, + bool InEpilogue) const { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; MachineInstr::MIFlag Flag = @@ -280,13 +312,11 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, if (ThisVal == SlotSize) { // Use push / pop for slot sized adjustments as a size optimization. We // need to find a dead register when using pop. - unsigned Reg = isSub - ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) - : TRI->findDeadCallerSavedReg(MBB, MBBI); + unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) + : TRI->findDeadCallerSavedReg(MBB, MBBI); if (Reg) { - unsigned Opc = isSub - ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) - : (Is64Bit ? X86::POP64r : X86::POP32r); + unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) + : (Is64Bit ? X86::POP64r : X86::POP32r); BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)) .setMIFlag(Flag); @@ -562,49 +592,13 @@ void X86FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, RegsToZero.reset(Reg); } + // Zero out the GPRs first. for (MCRegister Reg : GPRsToZero.set_bits()) - BuildMI(MBB, MBBI, DL, TII.get(X86::XOR32rr), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); - - // Zero out registers. - for (MCRegister Reg : RegsToZero.set_bits()) { - if (ST.hasMMX() && X86::VR64RegClass.contains(Reg)) - // FIXME: Ignore MMX registers? - continue; + TII.buildClearRegister(Reg, MBB, MBBI, DL); - unsigned XorOp; - if (X86::VR128RegClass.contains(Reg)) { - // XMM# - if (!ST.hasSSE1()) - continue; - XorOp = X86::PXORrr; - } else if (X86::VR256RegClass.contains(Reg)) { - // YMM# - if (!ST.hasAVX()) - continue; - XorOp = X86::VPXORrr; - } else if (X86::VR512RegClass.contains(Reg)) { - // ZMM# - if (!ST.hasAVX512()) - continue; - XorOp = X86::VPXORYrr; - } else if (X86::VK1RegClass.contains(Reg) || - X86::VK2RegClass.contains(Reg) || - X86::VK4RegClass.contains(Reg) || - X86::VK8RegClass.contains(Reg) || - X86::VK16RegClass.contains(Reg)) { - if (!ST.hasVLX()) - continue; - XorOp = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr; - } else { - continue; - } - - BuildMI(MBB, MBBI, DL, TII.get(XorOp), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); - } + // Zero out the remaining registers. + for (MCRegister Reg : RegsToZero.set_bits()) + TII.buildClearRegister(Reg, MBB, MBBI, DL); } void X86FrameLowering::emitStackProbe( @@ -959,24 +953,16 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( // registers. For the prolog expansion we use RAX, RCX and RDX. MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RegClass = &X86::GR64RegClass; - const Register SizeReg = InProlog ? X86::RAX - : MRI.createVirtualRegister(RegClass), - ZeroReg = InProlog ? X86::RCX - : MRI.createVirtualRegister(RegClass), - CopyReg = InProlog ? X86::RDX - : MRI.createVirtualRegister(RegClass), - TestReg = InProlog ? X86::RDX - : MRI.createVirtualRegister(RegClass), - FinalReg = InProlog ? X86::RDX - : MRI.createVirtualRegister(RegClass), - RoundedReg = InProlog ? X86::RDX - : MRI.createVirtualRegister(RegClass), - LimitReg = InProlog ? X86::RCX - : MRI.createVirtualRegister(RegClass), - JoinReg = InProlog ? X86::RCX - : MRI.createVirtualRegister(RegClass), - ProbeReg = InProlog ? X86::RCX - : MRI.createVirtualRegister(RegClass); + const Register + SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass), + ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), + CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), + TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), + FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), + RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass), + LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), + JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass), + ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass); // SP-relative offsets where we can save RCX and RDX. int64_t RCXShadowSlot = 0; @@ -1048,7 +1034,9 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( .addReg(X86::GS); BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); // Jump if the desired stack pointer is at or above the stack limit. - BuildMI(&MBB, DL, TII.get(X86::JCC_1)).addMBB(ContinueMBB).addImm(X86::COND_AE); + BuildMI(&MBB, DL, TII.get(X86::JCC_1)) + .addMBB(ContinueMBB) + .addImm(X86::COND_AE); // Add code to roundMBB to round the final stack pointer to a page boundary. RoundMBB->addLiveIn(FinalReg); @@ -1085,7 +1073,9 @@ void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); - BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)).addMBB(LoopMBB).addImm(X86::COND_NE); + BuildMI(LoopMBB, DL, TII.get(X86::JCC_1)) + .addMBB(LoopMBB) + .addImm(X86::COND_NE); MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); @@ -1169,7 +1159,7 @@ void X86FrameLowering::emitStackProbeCall( CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); } else { CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)) - .addExternalSymbol(MF.createExternalSymbolName(Symbol)); + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } unsigned AX = Uses64BitFramePtr ? X86::RAX : X86::EAX; @@ -1231,7 +1221,8 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) { // info, we need to know the ABI stack alignment as well in case we // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum SlotSize. -uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { +uint64_t +X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment. Align StackAlign = getStackAlign(); @@ -1322,8 +1313,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, // Loop entry block { - const unsigned SUBOpc = - getSUBriOpcode(Uses64BitFramePtr); + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr); BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) @@ -1353,8 +1343,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, .addImm(0) .setMIFlag(MachineInstr::FrameSetup); - const unsigned SUBOpc = - getSUBriOpcode(Uses64BitFramePtr); + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr); BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) @@ -1405,7 +1394,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, } } -bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { +bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const { // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be // clobbered by any interrupt handler. assert(&STI == &MF.getSubtarget<X86Subtarget>() && @@ -1521,7 +1510,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. - uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. + uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. bool IsFunclet = MBB.isEHFuncletEntry(); EHPersonality Personality = EHPersonality::Unknown; if (Fn.hasPersonalityFn()) @@ -1539,8 +1528,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, bool NeedsDwarfCFI = needsDwarfCFI(MF); Register FramePtr = TRI->getFrameRegister(MF); const Register MachineFramePtr = - STI.isTarget64BitILP32() - ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; + STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64)) + : FramePtr; Register BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; @@ -1575,7 +1564,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); int64_t Offset = -(int64_t)SlotSize; - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm: X86::PUSH32rmm)) + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm)) .addReg(ArgBaseReg) .addImm(1) .addReg(X86::NoRegister) @@ -1587,7 +1576,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Space reserved for stack-based arguments when making a (ABI-guaranteed) // tail call. unsigned TailCallArgReserveSize = -X86FI->getTCReturnAddrDelta(); - if (TailCallArgReserveSize && IsWin64Prologue) + if (TailCallArgReserveSize && IsWin64Prologue) report_fatal_error("Can't handle guaranteed tail call under win64 yet"); const bool EmitStackProbeCall = @@ -1659,7 +1648,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize() - X86FI->getTCReturnAddrDelta(); - if (HasFP) MinSize += SlotSize; + if (HasFP) + MinSize += SlotSize; X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); @@ -1714,17 +1704,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - NumBytes = FrameSize - - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize); + NumBytes = + FrameSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize); // Callee-saved registers are pushed on stack before the stack is realigned. if (TRI->hasStackRealignment(MF) && !IsWin64Prologue) NumBytes = alignTo(NumBytes, MaxAlign); // Save EBP/RBP into the appropriate stack slot. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(MachineFramePtr, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, + TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>()))) + .addReg(MachineFramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI && !ArgBaseReg.isValid()) { // Mark the place where EBP/RBP was saved. @@ -1839,8 +1830,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); - NumBytes = StackSize - - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize); + NumBytes = + StackSize - (X86FI->getCalleeSavedFrameSize() + TailCallArgReserveSize); } // Update the offset adjustment, which is mainly used by codeview to translate @@ -1861,19 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; + MachineBasicBlock::const_iterator LastCSPush = MBBI; + auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) { + if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup)) + return false; + unsigned Opc = MBBI->getOpcode(); + return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r || + Opc == X86::PUSH2 || Opc == X86::PUSH2P; + }; - while (MBBI != MBB.end() && - MBBI->getFlag(MachineInstr::FrameSetup) && - (MBBI->getOpcode() == X86::PUSH32r || - MBBI->getOpcode() == X86::PUSH64r)) { + while (IsCSPush(MBBI)) { PushedRegs = true; Register Reg = MBBI->getOperand(0).getReg(); + LastCSPush = MBBI; ++MBBI; + unsigned Opc = LastCSPush->getOpcode(); if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); + // Compared to push, push2 introduces more stack offset (one more + // register). + if (Opc == X86::PUSH2 || Opc == X86::PUSH2P) + StackOffset += stackGrowth; BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset), MachineInstr::FrameSetup); @@ -1885,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(Reg) .setMIFlag(MachineInstr::FrameSetup); + if (Opc == X86::PUSH2 || Opc == X86::PUSH2P) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(LastCSPush->getOperand(1).getReg()) + .setMIFlag(MachineInstr::FrameSetup); } } @@ -1933,13 +1939,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (Is64Bit) { // Save RAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) - .addReg(X86::RAX, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(X86::RAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); } else { // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) - .addReg(X86::EAX, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); } } @@ -2122,16 +2128,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) - .addReg(SPOrEstablisher) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), - FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .addReg(SPOrEstablisher) - .setMIFlag(MachineInstr::FrameSetup); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, + X86FI->getRestoreBasePointerOffset()) + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); } if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { @@ -2244,9 +2250,9 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { // This is the size of the pushed CSRs. unsigned CSSize = X86FI->getCalleeSavedFrameSize(); // This is the size of callee saved XMMs. - const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); - unsigned XMMSize = WinEHXMMSlotInfo.size() * - TRI->getSpillSize(X86::VR128RegClass); + const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + unsigned XMMSize = + WinEHXMMSlotInfo.size() * TRI->getSpillSize(X86::VR128RegClass); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = @@ -2270,10 +2276,9 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { } static bool isTailCallOpcode(unsigned Opc) { - return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi || - Opc == X86::TCRETURNmi || - Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 || - Opc == X86::TCRETURNmi64; + return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi || + Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || + Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64; } void X86FrameLowering::emitEpilogue(MachineFunction &MF, @@ -2359,18 +2364,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (X86FI->hasSwiftAsyncContext()) { // Discard the context. int Offset = 16 + mergeSPUpdates(MBB, MBBI, true); - emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/true); + emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true); } // Pop EBP. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + BuildMI(MBB, MBBI, DL, + TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); // We need to reset FP to its untagged state on return. Bit 60 is currently // used to show the presence of an extended frame. if (X86FI->hasSwiftAsyncContext()) { - BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), - MachineFramePtr) + BuildMI(MBB, MBBI, DL, TII.get(X86::BTR64ri8), MachineFramePtr) .addUse(MachineFramePtr) .addImm(60) .setMIFlag(MachineInstr::FrameDestroy); @@ -2403,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned Opc = PI->getOpcode(); if (Opc != X86::DBG_VALUE && !PI->isTerminator()) { - if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy))) + if (!PI->getFlag(MachineInstr::FrameDestroy) || + (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 && + Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 && + Opc != X86::POP2P && Opc != X86::LEA64r)) break; FirstCSPop = PI; } @@ -2458,13 +2463,12 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // effects of the prologue can safely be undone. if (LEAAmount != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), - FramePtr, false, LEAAmount); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, + false, LEAAmount); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); - BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(FramePtr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr).addReg(FramePtr); --MBBI; } } else if (NumBytes) { @@ -2498,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = MBBI; unsigned Opc = PI->getOpcode(); ++MBBI; - if (Opc == X86::POP32r || Opc == X86::POP64r) { + if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r || + Opc == X86::POP2 || Opc == X86::POP2P) { Offset += SlotSize; + // Compared to pop, pop2 introduces more stack offset (one more + // register). + if (Opc == X86::POP2 || Opc == X86::POP2P) + Offset += SlotSize; BuildCFI(MBB, MBBI, DL, MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset), MachineInstr::FrameDestroy); @@ -2570,7 +2579,8 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - // If required, include space for extra hidden slot for stashing base pointer. + // If required, include space for extra hidden slot for stashing base + // pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; uint64_t NumBytes = FrameSize - CSSize; @@ -2615,7 +2625,7 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + const auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); const auto it = WinEHXMMSlotInfo.find(FI); if (it == WinEHXMMSlotInfo.end()) @@ -2743,7 +2753,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // } // [EBP] MFI.CreateFixedObject(-TailCallReturnAddrDelta, - TailCallReturnAddrDelta - SlotSize, true); + TailCallReturnAddrDelta - SlotSize, true); } // Spill the BasePtr if it's used. @@ -2774,13 +2784,37 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // about avoiding it later. Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { - if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { + if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) { CSI.erase(CSI.begin() + i); break; } } } + // Strategy: + // 1. Use push2 when + // a) number of CSR > 1 if no need padding + // b) number of CSR > 2 if need padding + // 2. When the number of CSR push is odd + // a. Start to use push2 from the 1st push if stack is 16B aligned. + // b. Start to use push2 from the 2nd push if stack is not 16B aligned. + // 3. When the number of CSR push is even, start to use push2 from the 1st + // push and make the stack 16B aligned before the push + unsigned NumRegsForPush2 = 0; + if (STI.hasPush2Pop2()) { + unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) { + return X86::GR64RegClass.contains(I.getReg()); + }); + bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0); + bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1; + X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2); + NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0; + if (X86FI->padForPush2Pop2()) { + SpillSlotOffset -= SlotSize; + MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + } + } + // Assign slots for GPRs. It increases frame size. for (CalleeSavedInfo &I : llvm::reverse(CSI)) { Register Reg = I.getReg(); @@ -2788,6 +2822,13 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; + // A CSR is a candidate for push2/pop2 when it's slot offset is 16B aligned + // or only an odd number of registers in the candidates. + if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 && + (SpillSlotOffset % 16 == 0 || + X86FI->getNumCandidatesForPush2Pop2() % 2)) + X86FI->addCandidateForPush2Pop2(Reg); + SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; @@ -2805,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // TODO: saving the slot index is better? X86FI->setRestoreBasePointer(CalleeSavedFrameSize); } + assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 && + "Expect even candidates for push2/pop2"); + if (X86FI->getNumCandidatesForPush2Pop2()) + ++NumFunctionUsingPush2Pop2; X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize); @@ -2854,40 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Push GPRs. It increases frame size. const MachineFunction &MF = *MBB.getParent(); - unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; - for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { - Register Reg = I.getReg(); - - if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) - continue; + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (X86FI->padForPush2Pop2()) + emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false); + // Update LiveIn of the basic block and decide whether we can add a kill flag + // to the use. + auto UpdateLiveInCheckCanKill = [&](Register Reg) { const MachineRegisterInfo &MRI = MF.getRegInfo(); - bool isLiveIn = MRI.isLiveIn(Reg); - if (!isLiveIn) - MBB.addLiveIn(Reg); - - // Decide whether we can add a kill flag to the use. - bool CanKill = !isLiveIn; - // Check if any subregister is live-in - if (CanKill) { - for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) { - if (MRI.isLiveIn(*AReg)) { - CanKill = false; - break; - } - } - } - // Do not set a kill flag on values that are also marked as live-in. This // happens with the @llvm-returnaddress intrinsic and with arguments // passed in callee saved registers. // Omitting the kill flags is conservatively correct even if the live-in // is not used after all. - BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill)) - .setMIFlag(MachineInstr::FrameSetup); + if (MRI.isLiveIn(Reg)) + return false; + MBB.addLiveIn(Reg); + // Check if any subregister is live-in + for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) + if (MRI.isLiveIn(*AReg)) + return false; + return true; + }; + auto UpdateLiveInGetKillRegState = [&](Register Reg) { + return getKillRegState(UpdateLiveInCheckCanKill(Reg)); + }; + + for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) { + Register Reg = RI->getReg(); + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + if (X86FI->isCandidateForPush2Pop2(Reg)) { + Register Reg2 = (++RI)->getReg(); + BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI))) + .addReg(Reg, UpdateLiveInGetKillRegState(Reg)) + .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2)) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI))) + .addReg(Reg, UpdateLiveInGetKillRegState(Reg)) + .setMIFlag(MachineInstr::FrameSetup); + } } - const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); if (X86FI->getRestoreBasePointer()) { unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; Register BaseReg = this->TRI->getBaseRegister(); @@ -2979,8 +3034,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( // Reload XMMs from stack frame. for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); - if (X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg)) + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // If this is k-register make sure we lookup via the largest legal type. @@ -3004,16 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( } // POP GPRs. - unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; - for (const CalleeSavedInfo &I : CSI) { - Register Reg = I.getReg(); - if (!X86::GR64RegClass.contains(Reg) && - !X86::GR32RegClass.contains(Reg)) + for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) { + Register Reg = I->getReg(); + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg) - .setMIFlag(MachineInstr::FrameDestroy); + if (X86FI->isCandidateForPush2Pop2(Reg)) + BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg) + .addReg((++I)->getReg(), RegState::Define) + .setMIFlag(MachineInstr::FrameDestroy); + else + BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } + if (X86FI->padForPush2Pop2()) + emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true); + return true; } @@ -3023,7 +3083,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); // Spill the BasePtr if it's used. - if (TRI->hasBasePointer(MF)){ + if (TRI->hasBasePointer(MF)) { Register BasePtr = TRI->getBaseRegister(); if (STI.isTarget64BitILP32()) BasePtr = getX86SubSuperRegister(BasePtr, 64); @@ -3031,11 +3091,10 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, } } -static bool -HasNestArgument(const MachineFunction *MF) { +static bool HasNestArgument(const MachineFunction *MF) { const Function &F = MF->getFunction(); - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; I++) { + for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; + I++) { if (I->hasNestAttr() && !I->use_empty()) return true; } @@ -3046,8 +3105,8 @@ HasNestArgument(const MachineFunction *MF) { /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. -static unsigned -GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { +static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64, + const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction().getCallingConv(); // Erlang stuff. @@ -3148,7 +3207,7 @@ void X86FrameLowering::adjustForSegmentedStacks( TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; - TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. + TlsOffset = 0x60 + 90 * 8; // See pthread_machdep.h. Steal TLS slot 90. } else if (STI.isTargetWin64()) { TlsReg = X86::GS; TlsOffset = 0x28; // pvArbitrary, reserved for application use @@ -3165,18 +3224,28 @@ void X86FrameLowering::adjustForSegmentedStacks( if (CompareStackPointer) ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else - BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) - .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), + ScratchReg) + .addReg(X86::RSP) + .addImm(1) + .addReg(0) + .addImm(-StackSize) + .addReg(0); - BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) - .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(TlsOffset) + .addReg(TlsReg); } else { if (STI.isTargetLinux()) { TlsReg = X86::GS; TlsOffset = 0x30; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; - TlsOffset = 0x48 + 90*4; + TlsOffset = 0x48 + 90 * 4; } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use @@ -3192,13 +3261,22 @@ void X86FrameLowering::adjustForSegmentedStacks( if (CompareStackPointer) ScratchReg = X86::ESP; else - BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) - .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg) + .addReg(X86::ESP) + .addImm(1) + .addReg(0) + .addImm(-StackSize) + .addReg(0); if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || STI.isTargetDragonFly()) { - BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(TlsOffset) + .addReg(TlsReg); } else if (STI.isTargetDarwin()) { // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. @@ -3223,15 +3301,17 @@ void X86FrameLowering::adjustForSegmentedStacks( if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) - .addReg(ScratchReg2, RegState::Kill); + .addReg(ScratchReg2, RegState::Kill); BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) - .addImm(TlsOffset); + .addImm(TlsOffset); BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) - .addReg(ScratchReg) - .addReg(ScratchReg2).addImm(1).addReg(0) - .addImm(0) - .addReg(TlsReg); + .addReg(ScratchReg) + .addReg(ScratchReg2) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(TlsReg); if (SaveScratch2) BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); @@ -3240,7 +3320,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_A); + BuildMI(checkMBB, DL, TII.get(X86::JCC_1)) + .addMBB(&PrologueMBB) + .addImm(X86::COND_A); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. @@ -3264,9 +3346,8 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(X86FI->getArgumentStackSize()); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)) - .addImm(X86FI->getArgumentStackSize()); - BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)) - .addImm(StackSize); + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSH32i)).addImm(StackSize); } // __morestack is in libgcc @@ -3298,10 +3379,10 @@ void X86FrameLowering::adjustForSegmentedStacks( } else { if (Is64Bit) BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack"); + .addExternalSymbol("__morestack"); else BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack"); + .addExternalSymbol("__morestack"); } if (IsNested) @@ -3323,22 +3404,24 @@ void X86FrameLowering::adjustForSegmentedStacks( /// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets /// to fields it needs, through a named metadata node "hipe.literals" containing /// name-value pairs. -static unsigned getHiPELiteral( - NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) { +static unsigned getHiPELiteral(NamedMDNode *HiPELiteralsMD, + const StringRef LiteralName) { for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) { MDNode *Node = HiPELiteralsMD->getOperand(i); - if (Node->getNumOperands() != 2) continue; + if (Node->getNumOperands() != 2) + continue; MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0)); ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1)); - if (!NodeName || !NodeVal) continue; + if (!NodeName || !NodeVal) + continue; ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue()); if (ValConst && NodeName->getString() == LiteralName) { return ValConst->getZExtValue(); } } - report_fatal_error("HiPE literal " + LiteralName - + " required but not provided"); + report_fatal_error("HiPE literal " + LiteralName + + " required but not provided"); } // Return true if there are no non-ehpad successors to MBB and there are no @@ -3378,19 +3461,19 @@ void X86FrameLowering::adjustForHiPEPrologue( assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet"); // HiPE-specific values - NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule() - ->getNamedMetadata("hipe.literals"); + NamedMDNode *HiPELiteralsMD = + MF.getMMI().getModule()->getNamedMetadata("hipe.literals"); if (!HiPELiteralsMD) report_fatal_error( "Can't generate HiPE prologue without runtime parameters"); - const unsigned HipeLeafWords - = getHiPELiteral(HiPELiteralsMD, - Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS"); + const unsigned HipeLeafWords = getHiPELiteral( + HiPELiteralsMD, Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS"); const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; const unsigned Guaranteed = HipeLeafWords * SlotSize; - unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ? - MF.getFunction().arg_size() - CCRegisteredArgs : 0; - unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize; + unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs + ? MF.getFunction().arg_size() - CCRegisteredArgs + : 0; + unsigned MaxStack = MFI.getStackSize() + CallerStkArity * SlotSize + SlotSize; assert(STI.isTargetLinux() && "HiPE prologue is only supported on Linux operating systems."); @@ -3430,11 +3513,13 @@ void X86FrameLowering::adjustForHiPEPrologue( F->getName().find_first_of("._") == StringRef::npos) continue; - unsigned CalleeStkArity = - F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; + unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs + ? F->arg_size() - CCRegisteredArgs + : 0; if (HipeLeafWords - 1 > CalleeStkArity) - MoreStackForCalls = std::max(MoreStackForCalls, - (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); + MoreStackForCalls = + std::max(MoreStackForCalls, + (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); } } MaxStack += MoreStackForCalls; @@ -3459,13 +3544,13 @@ void X86FrameLowering::adjustForHiPEPrologue( SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT"); if (Is64Bit) { SPReg = X86::RSP; - PReg = X86::RBP; + PReg = X86::RBP; LEAop = X86::LEA64r; CMPop = X86::CMP64rm; CALLop = X86::CALL64pcrel32; } else { SPReg = X86::ESP; - PReg = X86::EBP; + PReg = X86::EBP; LEAop = X86::LEA32r; CMPop = X86::CMP32rm; CALLop = X86::CALLpcrel32; @@ -3476,21 +3561,24 @@ void X86FrameLowering::adjustForHiPEPrologue( "HiPE prologue scratch register is live-in"); // Create new MBB for StackCheck: - addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), - SPReg, false, -MaxStack); + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), SPReg, + false, -MaxStack); // SPLimitOffset is in a fixed heap location (pointed by BP). - addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) - .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)).addMBB(&PrologueMBB).addImm(X86::COND_AE); + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)).addReg(ScratchReg), + PReg, false, SPLimitOffset); + BuildMI(stackCheckMBB, DL, TII.get(X86::JCC_1)) + .addMBB(&PrologueMBB) + .addImm(X86::COND_AE); // Create new MBB for IncStack: - BuildMI(incStackMBB, DL, TII.get(CALLop)). - addExternalSymbol("inc_stack_0"); - addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), - SPReg, false, -MaxStack); - addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) - .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)).addMBB(incStackMBB).addImm(X86::COND_LE); + BuildMI(incStackMBB, DL, TII.get(CALLop)).addExternalSymbol("inc_stack_0"); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg, + false, -MaxStack); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)).addReg(ScratchReg), + PReg, false, SPLimitOffset); + BuildMI(incStackMBB, DL, TII.get(X86::JCC_1)) + .addMBB(incStackMBB) + .addImm(X86::COND_LE); stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); @@ -3570,15 +3658,15 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, Regs[FoundRegs++] = Regs[0]; for (int i = 0; i < NumPops; ++i) - BuildMI(MBB, MBBI, DL, - TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); + BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), + Regs[i]); return true; } -MachineBasicBlock::iterator X86FrameLowering:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { +MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { bool reserveCallFrame = hasReservedCallFrame(MF); unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); @@ -3666,9 +3754,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. if (CfaAdjustment) { - BuildCFI(MBB, InsertPos, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, - CfaAdjustment)); + BuildCFI( + MBB, InsertPos, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment)); } } @@ -3837,11 +3925,11 @@ X86FrameLowering::getDwarfFrameBase(const MachineFunction &MF) const { namespace { // Struct used by orderFrameObjects to help sort the stack objects. struct X86FrameSortingObject { - bool IsValid = false; // true if we care about this Object. - unsigned ObjectIndex = 0; // Index of Object into MFI list. - unsigned ObjectSize = 0; // Size of Object in bytes. + bool IsValid = false; // true if we care about this Object. + unsigned ObjectIndex = 0; // Index of Object into MFI list. + unsigned ObjectSize = 0; // Size of Object in bytes. Align ObjectAlignment = Align(1); // Alignment of Object in bytes. - unsigned ObjectNumUses = 0; // Object static number of uses. + unsigned ObjectNumUses = 0; // Object static number of uses. }; // The comparison function we use for std::sort to order our local @@ -3881,9 +3969,9 @@ struct X86FrameSortingComparator { // the division and, with it, the need for any floating point // arithmetic. DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) * - static_cast<uint64_t>(B.ObjectSize); + static_cast<uint64_t>(B.ObjectSize); DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) * - static_cast<uint64_t>(A.ObjectSize); + static_cast<uint64_t>(A.ObjectSize); // If the two densities are equal, prioritize highest alignment // objects. This allows for similar alignment objects @@ -3976,8 +4064,8 @@ void X86FrameLowering::orderFrameObjects( std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end()); } - -unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { +unsigned +X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. unsigned Offset = 16; // RBP is immediately pushed. |
