diff options
Diffstat (limited to 'lib/Target/AArch64/AArch64FrameLowering.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64FrameLowering.cpp | 895 |
1 files changed, 535 insertions, 360 deletions
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3f63d049c34ed..82111e5c72593 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -93,6 +93,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. - if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; + return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasStackMap() || - MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); + // Retain behavior of always omitting the FP for leaf functions when possible. + return (MFI->hasCalls() && + MF.getTarget().Options.DisableFramePointerElim(MF)) || + MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } -void AArch64FrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = @@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); + Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; @@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. // - // Mostly call frames will be allocated at the start of a function so + // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); @@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } - MBB.erase(I); + return MBB.erase(I); } void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout &TD = MF.getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD.getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - + int64_t Offset = + MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } -/// Get FPOffset by analyzing the first instruction. -static int getFPOffsetInPrologue(MachineInstr *MBBI) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert(((MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre) && - MBBI->getOperand(3).getReg() == AArch64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0)); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - return FPOffset; -} +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { + MachineFunction *MF = MBB->getParent(); + + // If MBB is an entry block, use X9 as the scratch register + if (&MF->front() == MBB) + return AArch64::X9; + + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(&TRI); + LiveRegs.addLiveIns(*MBB); + + // Mark callee saved registers as used so we will not choose them. + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + // Prefer X9 since it was historically used for the prologue scratch reg. + const MachineRegisterInfo &MRI = MF->getRegInfo(); + if (LiveRegs.available(MRI, AArch64::X9)) + return AArch64::X9; -static bool isCSSave(MachineInstr *MBBI) { - return MBBI->getOpcode() == AArch64::STPXi || - MBBI->getOpcode() == AArch64::STPDi || - MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre; + for (unsigned Reg : AArch64::GPR64RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - // Otherwise, we may need a scratch register to be available and we do not - // support that for now. - return !RegInfo->needsStackRealignment(*MF); + if (!RegInfo->needsStackRealignment(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. + return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; +} + +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512) + return false; + + if (MFI->hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. + if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. +static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { + + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. + unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.addOperand(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + unsigned LocalStackSize) { + unsigned Opc = MI.getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; + assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. + assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, @@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - + if (!NumBytes) + return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { + if (canUseRedZone(MF)) + ++NumRedZoneFunctions; + else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - } else if (NumBytes) { - ++NumRedZoneFunctions; } - return; } - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) - FPOffset = getFPOffsetInPrologue(MBBI); + auto CSStackSize = AFI->getCalleeSavedStackSize(); + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes - CSStackSize); - // Move past the saves of the callee-saved registers. - while (isCSSave(MBBI)) { - ++MBBI; - NumBytes -= 16; + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (CSStackSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -CSStackSize); + NumBytes -= CSStackSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + MachineBasicBlock::iterator End = MBB.end(); + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); + ++MBBI; + } if (HasFP) { + // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. + int FPOffset = CSStackSize - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". @@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - // Allocate space for the rest of the frame. + if (NumBytes) { + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; - const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); - unsigned scratchSPReg = AArch64::SP; - if (NumBytes && NeedsRealignment) { - // Use the first callee-saved register as a scratch register. - scratchSPReg = AArch64::X9; - } + if (NeedsRealignment) { + scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + assert(scratchSPReg != AArch64::NoRegister); + } - // If we're a leaf function, try using the red zone. - if (NumBytes && !canUseRedZone(MF)) - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); - if (NumBytes && NeedsRealignment) { - const unsigned NrBitsToZero = countTrailingZeros(Alignment); - assert(NrBitsToZero > 1); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - // The logical immediates have a non-trivial encoding. The following - // formula computes the encoded immediate with all ones but - // NrBitsToZero zero bits as least significant bits. - uint32_t andMaskEncoded = - (1 <<12) // = N - | ((64-NrBitsToZero) << 6) // immr - | ((64-NrBitsToZero-1) << 0) // imms - ; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsRealignment) { + const unsigned Alignment = MFI->getMaxAlignment(); + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = (1 << 12) // = N + | ((64 - NrBitsToZero) << 6) // immr + | ((64 - NrBitsToZero - 1) << 0); // imms + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + AFI->setStackRealigned(true); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( @@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); } - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -/// Checks whether the given instruction restores callee save registers -/// and if so returns how many. -static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - switch (MI.getOpcode()) { - case AArch64::LDPXpost: - case AArch64::LDPDpost: - RtIdx = 1; - // FALLTHROUGH - case AArch64::LDPXi: - case AArch64::LDPDi: - if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || - MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) - return 0; - return 2; + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). + emitCalleeSavedFrameMoves(MBB, MBBI); } - return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, @@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; @@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 8) | | | + // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - unsigned NumRestores = 0; + auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && CSStackSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); - NumRestores += Restores; - if (Restores == 0) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize()); + } + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; } - NumBytes -= NumRestores * 8; + + NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { + bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, - TII); - return; + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && ArgumentPopSize == 0) + return; + + bool NoCalleeSaveRestore = CSStackSize == 0; + int StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += ArgumentPopSize; + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackRestoreBytes, TII, MachineInstr::FrameDestroy); + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || ArgumentPopSize == 0) + return; + NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (NumBytes || MFI->hasVarSizedObjects()) + if (MFI->hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); + -CSStackSize + 16, TII, MachineInstr::FrameDestroy); + else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, + MachineInstr::FrameDestroy); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (ArgumentPopSize) + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + ArgumentPopSize, TII, MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != AArch64::LR) - return getKillRegState(true); + // Do not set a kill flag on values that are also marked as live-in. This + // happens with the @llvm-returnaddress intrinsic and with arguments passed in + // callee saved registers. + // Omitting the kill flags is conservatively correct even if the live-in + // is not used after all. + bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); + return getKillRegState(!IsLiveIn); +} - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); +static bool produceCompactUnwindFrame(MachineFunction &MF) { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + AttributeSet Attrs = MF.getFunction()->getAttributes(); + return Subtarget.isTargetMachO() && + !(Subtarget.getTargetLowering()->supportSwiftError() && + Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -bool AArch64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + +struct RegPairInfo { + RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {} + unsigned Reg1; + unsigned Reg2; + int FrameIdx; + int Offset; + bool IsGPR; + bool isPaired() const { return Reg2 != AArch64::NoRegister; } +}; + +static void computeCalleeSaveRegisterPairs( + MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) { + + if (CSI.empty()) + return; + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + CallingConv::ID CC = MF.getFunction()->getCallingConv(); unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + (void)CC; + // MachO's compact unwind format relies on all registers being stored in + // pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (Count & 1) == 0) && + "Odd number of callee-saved regs to spill!"); + unsigned Offset = AFI->getCalleeSavedStackSize(); + + for (unsigned i = 0; i < Count; ++i) { + RegPairInfo RPI; + RPI.Reg1 = CSI[i].getReg(); + + assert(AArch64::GPR64RegClass.contains(RPI.Reg1) || + AArch64::FPR64RegClass.contains(RPI.Reg1)); + RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); + + // Add the next reg to the pair if it is in the same register class. + if (i + 1 < Count) { + unsigned NextReg = CSI[i + 1].getReg(); + if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || + (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) + RPI.Reg2 = NextReg; + } - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + assert((!RPI.isPaired() || + (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + + // MachO's compact unwind format relies on all registers being stored in + // adjacent register pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (RPI.isPaired() && + ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || + RPI.Reg1 + 1 == RPI.Reg2))) && + "Callee-save registers not saved as adjacent register pair!"); + + RPI.FrameIdx = CSI[i].getFrameIdx(); + + if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + Offset -= 16; + assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16); + MFI->setObjectAlignment(RPI.FrameIdx, 16); + AFI->setCalleeSaveStackHasFreeSpace(true); + } else + Offset -= RPI.isPaired() ? 16 : 8; + assert(Offset % 8 == 0); + RPI.Offset = Offset / 8; + assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + "Offset out of bounds for LDP/STP immediate"); + + RegPairs.push_back(RPI); + if (RPI.isPaired()) + ++i; + } +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + SmallVector<RegPairInfo, 8> RegPairs; + + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + + // Issue sequence of spills for cs regs. The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPXpre; - else - StrOpc = AArch64::STPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPDpre; - else - StrOpc = AArch64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) - MIB.addReg(AArch64::SP, RegState::Define); + // Note: Similar rationale and sequence for restores in epilog. + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); MBB.addLiveIn(Reg1); - MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) + if (RPI.isPaired()) { + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOStore, 8, 8)); + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOStore, 8, 8)); } return true; } @@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned Count = CSI.size(); DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + SmallVector<RegPairInfo, 8> RegPairs; if (MI != MBB.end()) DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; + + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPXpost; - else - LdrOpc = AArch64::LDPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPDpost; - else - LdrOpc = AArch64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) - MIB.addReg(AArch64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) + if (RPI.isPaired()) { + MIB.addReg(Reg2, getDefRegState(true)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOLoad, 8, 8)); + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOLoad, 8, 8)); } return true; } @@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - SmallVector<unsigned, 4> UnspilledCSGPRs; - SmallVector<unsigned, 4> UnspilledCSFPRs; + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { @@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. + unsigned BasePointerReg = AArch64::NoRegister; if (RegInfo->hasBasePointer(MF)) - SavedRegs.set(RegInfo->getBaseRegister()); - - if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - SavedRegs.set(AArch64::X9); + BasePointerReg = RegInfo->getBaseRegister(); - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; + + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) + SavedRegs.set(Reg); - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((AArch64::GPR64RegClass.contains(OddReg) && - AArch64::GPR64RegClass.contains(EvenReg)) ^ - (AArch64::FPR64RegClass.contains(OddReg) && - AArch64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = SavedRegs.test(OddReg); - const bool EvenRegUsed = SavedRegs.test(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. - if (!OddRegUsed && !EvenRegUsed) { - if (AArch64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); + bool RegUsed = SavedRegs.test(Reg); + unsigned PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRPaired = PairedReg; } continue; } - unsigned Reg = AArch64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - SavedRegs.set(Reg); + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + SavedRegs.set(PairedReg); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = true; } + } - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (AArch64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + dbgs() << ' ' << PrintReg(Reg, RegInfo); + dbgs() << "\n";); - CanEliminateFrame = false; - } + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; // FIXME: Set BigStack if any stack slot references may be out of range. // For now, just conservatively guestimate based on unscaled indexing @@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = - MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); bool BigStack = (CFSize >= 256); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) @@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // above to keep the number of spills even, we don't need to do anything else // here. if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - SavedRegs.set(Reg); + if (UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); + SavedRegs.set(UnspilledCSGPR); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. + if (produceCompactUnwindFrame(MF)) + SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = true; - ++Count; + NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create @@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << " as the emergency spill slot.\n"); } } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); +} + +bool AArch64FrameLowering::enableStackSlotScavenging( + const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasCalleeSaveStackFreeSpace(); } |